/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define STOP(str) stop(str);
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
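// For example, patching a plain `b <target>` rewrites its 26-bit signed
// word offset in place (one instruction), while an adrp/add pair needs
// both the 21-bit page delta and the low 12 bits of the target patched
// (two instructions). The cases below enumerate every shape we emit.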
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
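      // As an illustration (the addresses are hypothetical), a type 1 pair
      //     adrp x8, 0x7f82345000      ; page of the target
      //     ldr  w0, [x8, #0x568]      ; low 12 bits of the target
      // needs both the adrp page delta and the scaled ldr immediate patched.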
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
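    // Patch the movz/movk/movk sequence emitted by movptr(): 16 bits of
    // the 48-bit destination per instruction, least-significant half first.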
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
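  // For a (hypothetical) narrow OOP 0x12345678 the pair below is patched to
  //   movz Rd, #0x1234, lsl #16
  //   movk Rd, #0x5678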
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
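  // The poll bit of the thread-local polling word is set when a safepoint
  // (or handshake) is pending, so a single tbnz on that bit suffices.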
  tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
  ldar(rscratch1, rscratch1);
  tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with a PAC signature during throw_pending_exception
  // if it was tail-call optimized by the compiler; since lr is not callee-saved,
  // reload it with the proper value
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->output()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        postcond(pc() == badAddress);
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  postcond(pc() != badAddress);
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

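// The emitted stub is therefore (the data word is read at data_offset):
//   ldr  rscratch1, <pc + 8>   ; load the 64-bit destination stored below
//   br   rscratch1
//   <8-byte destination address>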
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.
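  // Roughly (7 instructions; see CompiledStaticCall::to_interp_stub_size()):
  //   isb
  //   movz/movk/movk rmethod,   <Method*>     ; patched by set_to_interpreted
  //   movz/movk/movk rscratch1, <i2c entry>
  //   br rscratch1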

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
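  // rscratch2 is the inline cache register on AArch64; non_oop_word() is a
  // sentinel that can never match a real klass, so the first invocation
  // always goes through inline cache resolution.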
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

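  // The loop below emits the scan twice: the peeled first iteration
  // branches straight to found_method on a hit (the common case), while
  // the second copy inverts the test so that a hit falls through instead.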
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for occurrence of value,
// generic
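// A rough C sketch of the emitted loop (the flags on exit reflect the last
// cmp: EQ if value was found, NE if the scan ran off the end; if count is 0
// on entry the flags are left untouched, so callers pre-clear Z — see
// check_klass_subtype_slow_path below):
//   while (count != 0) { if (*addr++ == value) break; count--; }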
repne_scan(Register addr,Register value,Register count,Register scratch)1179 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1180                                 Register scratch) {
1181   Label Lloop, Lexit;
1182   cbz(count, Lexit);
1183   bind(Lloop);
1184   ldr(scratch, post(addr, wordSize));
1185   cmp(value, scratch);
1186   br(EQ, Lexit);
1187   sub(count, count, 1);
1188   cbnz(count, Lloop);
1189   bind(Lexit);
1190 }
1191 
1192 // scans count 4 byte words at [addr] for occurence of value,
1193 // generic
repne_scanw(Register addr,Register value,Register count,Register scratch)1194 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1195                                 Register scratch) {
1196   Label Lloop, Lexit;
1197   cbz(count, Lexit);
1198   bind(Lloop);
1199   ldrw(scratch, post(addr, wordSize));
1200   cmpw(value, scratch);
1201   br(EQ, Lexit);
1202   sub(count, count, 1);
1203   cbnz(count, Lloop);
1204   bind(Lexit);
1205 }
1206 
1207 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1208                                                    Register super_klass,
1209                                                    Register temp_reg,
1210                                                    Register temp2_reg,
1211                                                    Label* L_success,
1212                                                    Label* L_failure,
1213                                                    bool set_cond_codes) {
1214   assert_different_registers(sub_klass, super_klass, temp_reg);
1215   if (temp2_reg != noreg)
1216     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1217 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1218 
1219   Label L_fallthrough;
1220   int label_nulls = 0;
1221   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1222   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1223   assert(label_nulls <= 1, "at most one NULL in the batch");
1224 
1225   // a couple of useful fields in sub_klass:
1226   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1227   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1228   Address secondary_supers_addr(sub_klass, ss_offset);
1229   Address super_cache_addr(     sub_klass, sc_offset);
1230 
1231   BLOCK_COMMENT("check_klass_subtype_slow_path");
1232 
1233   // Do a linear scan of the secondary super-klass chain.
1234   // This code is rarely used, so simplicity is a virtue here.
1235   // The repne_scan instruction uses fixed registers, which we must spill.
1236   // Don't worry too much about pre-existing connections with the input regs.
1237 
1238   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1239   assert(sub_klass != r2, "killed reg"); // killed by the length load into r2
1240 
1241   RegSet pushed_registers;
1242   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1243   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1244 
1245   if (super_klass != r0 || UseCompressedOops) {
1246     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1247   }
1248 
1249   push(pushed_registers, sp);
1250 
1251   // Get super_klass value into r0 (even if it was in r5 or r2).
1252   if (super_klass != r0) {
1253     mov(r0, super_klass);
1254   }
1255 
1256 #ifndef PRODUCT
1257   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1258   Address pst_counter_addr(rscratch2);
1259   ldr(rscratch1, pst_counter_addr);
1260   add(rscratch1, rscratch1, 1);
1261   str(rscratch1, pst_counter_addr);
1262 #endif //PRODUCT
1263 
1264   // We will consult the secondary-super array.
1265   ldr(r5, secondary_supers_addr);
1266   // Load the array length.
1267   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1268   // Skip to start of data.
1269   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1270 
1271   cmp(sp, zr); // Clear Z flag; SP is never zero
1272   // Scan R2 words at [R5] for an occurrence of R0.
1273   // Set NZ/Z based on last compare.
1274   repne_scan(r5, r0, r2, rscratch1);
1275 
1276   // Unspill the temp. registers:
1277   pop(pushed_registers, sp);
1278 
1279   br(Assembler::NE, *L_failure);
1280 
1281   // Success.  Cache the super we found and proceed in triumph.
1282   str(super_klass, super_cache_addr);
1283 
1284   if (L_success != &L_fallthrough) {
1285     b(*L_success);
1286   }
1287 
1288 #undef IS_A_TEMP
1289 
1290   bind(L_fallthrough);
1291 }
1292 
1293 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1294   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1295   assert_different_registers(klass, rthread, scratch);
1296 
1297   Label L_fallthrough, L_tmp;
1298   if (L_fast_path == NULL) {
1299     L_fast_path = &L_fallthrough;
1300   } else if (L_slow_path == NULL) {
1301     L_slow_path = &L_fallthrough;
1302   }
1303   // Fast path check: class is fully initialized
1304   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1305   subs(zr, scratch, InstanceKlass::fully_initialized);
1306   br(Assembler::EQ, *L_fast_path);
1307 
1308   // Fast path check: current thread is initializer thread
1309   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1310   cmp(rthread, scratch);
1311 
1312   if (L_slow_path == &L_fallthrough) {
1313     br(Assembler::EQ, *L_fast_path);
1314     bind(*L_slow_path);
1315   } else if (L_fast_path == &L_fallthrough) {
1316     br(Assembler::NE, *L_slow_path);
1317     bind(*L_fast_path);
1318   } else {
1319     Unimplemented();
1320   }
1321 }
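
// Usage sketch for clinit_barrier above (register and label choices are
// illustrative): fall through on the fast path, branch out while another
// thread is still running the class initializer:
//   Label L_slow;
//   clinit_barrier(r10 /* klass */, rscratch1, NULL /* fast: fall through */, &L_slow);
//   // ... klass is initialized, or we are the initializer thread ...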
1322 
1323 void MacroAssembler::verify_oop(Register reg, const char* s) {
1324   if (!VerifyOops) return;
1325 
1326   // Pass register number to verify_oop_subroutine
1327   const char* b = NULL;
1328   {
1329     ResourceMark rm;
1330     stringStream ss;
1331     ss.print("verify_oop: %s: %s", reg->name(), s);
1332     b = code_string(ss.as_string());
1333   }
1334   BLOCK_COMMENT("verify_oop {");
1335 
1336   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1337   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1338 
1339   mov(r0, reg);
1340   movptr(rscratch1, (uintptr_t)(address)b);
1341 
1342   // call indirectly to solve generation ordering problem
1343   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1344   ldr(rscratch2, Address(rscratch2));
1345   blr(rscratch2);
1346 
1347   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1348   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1349 
1350   BLOCK_COMMENT("} verify_oop");
1351 }
1352 
1353 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1354   if (!VerifyOops) return;
1355 
1356   const char* b = NULL;
1357   {
1358     ResourceMark rm;
1359     stringStream ss;
1360     ss.print("verify_oop_addr: %s", s);
1361     b = code_string(ss.as_string());
1362   }
1363   BLOCK_COMMENT("verify_oop_addr {");
1364 
1365   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1366   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1367 
1368   // addr may contain sp so we will have to adjust it based on the
1369   // pushes that we just did.
1370   if (addr.uses(sp)) {
1371     lea(r0, addr);
1372     ldr(r0, Address(r0, 4 * wordSize));
1373   } else {
1374     ldr(r0, addr);
1375   }
1376   movptr(rscratch1, (uintptr_t)(address)b);
1377 
1378   // call indirectly to solve generation ordering problem
1379   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1380   ldr(rscratch2, Address(rscratch2));
1381   blr(rscratch2);
1382 
1383   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1384   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1385 
1386   BLOCK_COMMENT("} verify_oop_addr");
1387 }
1388 
1389 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1390                                          int extra_slot_offset) {
1391   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1392   int stackElementSize = Interpreter::stackElementSize;
1393   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1394 #ifdef ASSERT
1395   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1396   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1397 #endif
1398   if (arg_slot.is_constant()) {
1399     return Address(esp, arg_slot.as_constant() * stackElementSize
1400                    + offset);
1401   } else {
1402     add(rscratch1, esp, arg_slot.as_register(),
1403         ext::uxtx, exact_log2(stackElementSize));
1404     return Address(rscratch1, offset);
1405   }
1406 }
1407 
1408 void MacroAssembler::call_VM_leaf_base(address entry_point,
1409                                        int number_of_arguments,
1410                                        Label *retaddr) {
1411   Label E, L;
1412 
1413   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1414 
1415   mov(rscratch1, entry_point);
1416   blr(rscratch1);
1417   if (retaddr)
1418     bind(*retaddr);
1419 
1420   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1421   maybe_isb();
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1425   call_VM_leaf_base(entry_point, number_of_arguments);
1426 }
1427 
1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1429   pass_arg0(this, arg_0);
1430   call_VM_leaf_base(entry_point, 1);
1431 }
1432 
1433 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1434   pass_arg0(this, arg_0);
1435   pass_arg1(this, arg_1);
1436   call_VM_leaf_base(entry_point, 2);
1437 }
1438 
1439 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1440                                   Register arg_1, Register arg_2) {
1441   pass_arg0(this, arg_0);
1442   pass_arg1(this, arg_1);
1443   pass_arg2(this, arg_2);
1444   call_VM_leaf_base(entry_point, 3);
1445 }
1446 
1447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1448   pass_arg0(this, arg_0);
1449   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1450 }
1451 
1452 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1453 
1454   assert(arg_0 != c_rarg1, "smashed arg");
1455   pass_arg1(this, arg_1);
1456   pass_arg0(this, arg_0);
1457   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1458 }
1459 
1460 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1461   assert(arg_0 != c_rarg2, "smashed arg");
1462   assert(arg_1 != c_rarg2, "smashed arg");
1463   pass_arg2(this, arg_2);
1464   assert(arg_0 != c_rarg1, "smashed arg");
1465   pass_arg1(this, arg_1);
1466   pass_arg0(this, arg_0);
1467   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1468 }
1469 
1470 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1471   assert(arg_0 != c_rarg3, "smashed arg");
1472   assert(arg_1 != c_rarg3, "smashed arg");
1473   assert(arg_2 != c_rarg3, "smashed arg");
1474   pass_arg3(this, arg_3);
1475   assert(arg_0 != c_rarg2, "smashed arg");
1476   assert(arg_1 != c_rarg2, "smashed arg");
1477   pass_arg2(this, arg_2);
1478   assert(arg_0 != c_rarg1, "smashed arg");
1479   pass_arg1(this, arg_1);
1480   pass_arg0(this, arg_0);
1481   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1482 }
1483 
1484 void MacroAssembler::null_check(Register reg, int offset) {
1485   if (needs_explicit_null_check(offset)) {
1486     // provoke OS NULL exception if reg = NULL by
1487     // accessing M[reg] w/o changing any registers
1488     // NOTE: this is plenty to provoke a segv
1489     ldr(zr, Address(reg));
1490   } else {
1491     // nothing to do, (later) access of M[reg + offset]
1492     // will provoke OS NULL exception if reg = NULL
1493   }
1494 }
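
// For example (a sketch): for a field at a small offset such as 8 no code
// is emitted here -- the later ldr(r0, Address(reg, 8)) faults in the OS
// when reg is NULL and the signal handler turns the fault into a
// NullPointerException; only offsets beyond the protected page need the
// explicit ldr(zr, Address(reg)) probe above.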
1495 
1496 // MacroAssembler protected routines needed to implement
1497 // public methods
1498 
1499 void MacroAssembler::mov(Register r, Address dest) {
1500   code_section()->relocate(pc(), dest.rspec());
1501   u_int64_t imm64 = (u_int64_t)dest.target();
1502   movptr(r, imm64);
1503 }
1504 
1505 // Move a constant pointer into r.  In AArch64 mode the virtual
1506 // address space is 48 bits in size, so we only need three
1507 // instructions to create a patchable instruction sequence that can
1508 // reach anywhere.
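// For example (a sketch, assuming the 48-bit address 0x0000_1234_5678_9abc),
// the sequence emitted is:
//   movz(r, 0x9abc);        // bits  0..15
//   movk(r, 0x5678, 16);    // bits 16..31
//   movk(r, 0x1234, 32);    // bits 32..47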
1509 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1510 #ifndef PRODUCT
1511   {
1512     char buffer[64];
1513     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1514     block_comment(buffer);
1515   }
1516 #endif
1517   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1518   movz(r, imm64 & 0xffff);
1519   imm64 >>= 16;
1520   movk(r, imm64 & 0xffff, 16);
1521   imm64 >>= 16;
1522   movk(r, imm64 & 0xffff, 32);
1523 }
1524 
1525 // Macro to mov replicated immediate to vector register.
1526 //  Vd will get the following values for different arrangements in T
1527 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1528 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1529 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1530 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1531 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1532 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1533 //   T1D/T2D: invalid
1534 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1535   assert(T != T1D && T != T2D, "invalid arrangement");
1536   if (T == T8B || T == T16B) {
1537     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1538     movi(Vd, T, imm32 & 0xff, 0);
1539     return;
1540   }
1541   u_int32_t nimm32 = ~imm32;
1542   if (T == T4H || T == T8H) {
1543     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1544     imm32 &= 0xffff;
1545     nimm32 &= 0xffff;
1546   }
1547   u_int32_t x = imm32;
1548   int movi_cnt = 0;
1549   int movn_cnt = 0;
1550   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1551   x = nimm32;
1552   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1553   if (movn_cnt < movi_cnt) imm32 = nimm32;
1554   unsigned lsl = 0;
1555   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1556   if (movn_cnt < movi_cnt)
1557     mvni(Vd, T, imm32 & 0xff, lsl);
1558   else
1559     movi(Vd, T, imm32 & 0xff, lsl);
1560   imm32 >>= 8; lsl += 8;
1561   while (imm32) {
1562     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1563     if (movn_cnt < movi_cnt)
1564       bici(Vd, T, imm32 & 0xff, lsl);
1565     else
1566       orri(Vd, T, imm32 & 0xff, lsl);
1567     lsl += 8; imm32 >>= 8;
1568   }
1569 }
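
// For example (a sketch of the routine above): with T4S, imm32 == 0x00ff00ff
// costs two instructions on the MOVI path (movi 0xff lsl 0, then
// orri 0xff lsl 16), while imm32 == 0xffffff00 is cheaper via the inverted
// path: a single mvni(Vd, T4S, 0xff, 0).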
1570 
1571 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1572 {
1573 #ifndef PRODUCT
1574   {
1575     char buffer[64];
1576     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1577     block_comment(buffer);
1578   }
1579 #endif
1580   if (operand_valid_for_logical_immediate(false, imm64)) {
1581     orr(dst, zr, imm64);
1582   } else {
1583     // we can use a combination of MOVZ or MOVN with
1584     // MOVK to build up the constant
1585     u_int64_t imm_h[4];
1586     int zero_count = 0;
1587     int neg_count = 0;
1588     int i;
1589     for (i = 0; i < 4; i++) {
1590       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1591       if (imm_h[i] == 0) {
1592         zero_count++;
1593       } else if (imm_h[i] == 0xffffL) {
1594         neg_count++;
1595       }
1596     }
1597     if (zero_count == 4) {
1598       // one MOVZ will do
1599       movz(dst, 0);
1600     } else if (neg_count == 4) {
1601       // one MOVN will do
1602       movn(dst, 0);
1603     } else if (zero_count == 3) {
1604       for (i = 0; i < 4; i++) {
1605         if (imm_h[i] != 0L) {
1606           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1607           break;
1608         }
1609       }
1610     } else if (neg_count == 3) {
1611       // one MOVN will do
1612       for (int i = 0; i < 4; i++) {
1613         if (imm_h[i] != 0xffffL) {
1614           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1615           break;
1616         }
1617       }
1618     } else if (zero_count == 2) {
1619       // one MOVZ and one MOVK will do
1620       for (i = 0; i < 3; i++) {
1621         if (imm_h[i] != 0L) {
1622           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1623           i++;
1624           break;
1625         }
1626       }
1627       for (;i < 4; i++) {
1628         if (imm_h[i] != 0L) {
1629           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1630         }
1631       }
1632     } else if (neg_count == 2) {
1633       // one MOVN and one MOVK will do
1634       for (i = 0; i < 4; i++) {
1635         if (imm_h[i] != 0xffffL) {
1636           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1637           i++;
1638           break;
1639         }
1640       }
1641       for (;i < 4; i++) {
1642         if (imm_h[i] != 0xffffL) {
1643           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1644         }
1645       }
1646     } else if (zero_count == 1) {
1647       // one MOVZ and two MOVKs will do
1648       for (i = 0; i < 4; i++) {
1649         if (imm_h[i] != 0L) {
1650           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1651           i++;
1652           break;
1653         }
1654       }
1655       for (;i < 4; i++) {
1656         if (imm_h[i] != 0x0L) {
1657           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1658         }
1659       }
1660     } else if (neg_count == 1) {
1661       // one MOVN and two MOVKs will do
1662       for (i = 0; i < 4; i++) {
1663         if (imm_h[i] != 0xffffL) {
1664           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1665           i++;
1666           break;
1667         }
1668       }
1669       for (;i < 4; i++) {
1670         if (imm_h[i] != 0xffffL) {
1671           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1672         }
1673       }
1674     } else {
1675       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1676       movz(dst, (u_int32_t)imm_h[0], 0);
1677       for (i = 1; i < 4; i++) {
1678         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1679       }
1680     }
1681   }
1682 }
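
// Worked example (a sketch of mov_immediate64 above): imm64 == 0x1'0000'ffff
// has halfwords {0xffff, 0x0000, 0x0001, 0x0000}, so zero_count == 2 and the
// zero_count == 2 arm emits movz(dst, 0xffff, 0) followed by
// movk(dst, 0x0001, 32) -- two instructions instead of four.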
1683 
1684 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1685 {
1686 #ifndef PRODUCT
1687     {
1688       char buffer[64];
1689       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1690       block_comment(buffer);
1691     }
1692 #endif
1693   if (operand_valid_for_logical_immediate(true, imm32)) {
1694     orrw(dst, zr, imm32);
1695   } else {
1696     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1697     // constant
1698     u_int32_t imm_h[2];
1699     imm_h[0] = imm32 & 0xffff;
1700     imm_h[1] = ((imm32 >> 16) & 0xffff);
1701     if (imm_h[0] == 0) {
1702       movzw(dst, imm_h[1], 16);
1703     } else if (imm_h[0] == 0xffff) {
1704       movnw(dst, imm_h[1] ^ 0xffff, 16);
1705     } else if (imm_h[1] == 0) {
1706       movzw(dst, imm_h[0], 0);
1707     } else if (imm_h[1] == 0xffff) {
1708       movnw(dst, imm_h[0] ^ 0xffff, 0);
1709     } else {
1710       // use a MOVZ and MOVK (makes it easier to debug)
1711       movzw(dst, imm_h[0], 0);
1712       movkw(dst, imm_h[1], 16);
1713     }
1714   }
1715 }
1716 
1717 // Form an address from base + offset in Rd.  Rd may or may
1718 // not actually be used: you must use the Address that is returned.
1719 // It is up to you to ensure that the shift provided matches the size
1720 // of your data.
1721 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1722   if (Address::offset_ok_for_immed(byte_offset, shift))
1723     // It fits; no need for any heroics
1724     return Address(base, byte_offset);
1725 
1726   // Don't do anything clever with negative or misaligned offsets
1727   unsigned mask = (1 << shift) - 1;
1728   if (byte_offset < 0 || byte_offset & mask) {
1729     mov(Rd, byte_offset);
1730     add(Rd, base, Rd);
1731     return Address(Rd);
1732   }
1733 
1734   // See if we can do this with two 12-bit offsets
1735   {
1736     unsigned long word_offset = byte_offset >> shift;
1737     unsigned long masked_offset = word_offset & 0xfff000;
1738     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1739         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1740       add(Rd, base, masked_offset << shift);
1741       word_offset -= masked_offset;
1742       return Address(Rd, word_offset << shift);
1743     }
1744   }
1745 
1746   // Do it the hard way
1747   mov(Rd, byte_offset);
1748   add(Rd, base, Rd);
1749   return Address(Rd);
1750 }
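
// Worked example (a sketch of form_address above):
// form_address(rscratch1, rthread, 1 << 20, 3) cannot use a scaled 12-bit
// immediate (0x100000 >> 3 exceeds 0xfff), but 0x100000 is a valid
// add/sub immediate, so it emits add(rscratch1, rthread, 0x100000) and
// returns Address(rscratch1, 0).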
1751 
1752 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1753   if (UseLSE) {
1754     mov(tmp, 1);
1755     ldadd(Assembler::word, tmp, zr, counter_addr);
1756     return;
1757   }
1758   Label retry_load;
1759   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1760     prfm(Address(counter_addr), PSTL1STRM);
1761   bind(retry_load);
1762   // flush and load exclusive from the memory location
1763   ldxrw(tmp, counter_addr);
1764   addw(tmp, tmp, 1);
1765   // if we store+flush with no intervening write tmp will be zero
1766   stxrw(tmp2, tmp, counter_addr);
1767   cbnzw(tmp2, retry_load);
1768 }
1769 
1770 
1771 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1772                                     bool want_remainder, Register scratch)
1773 {
1774   // Full implementation of Java idiv and irem.  The function
1775   // returns the (pc) offset of the div instruction - may be needed
1776   // for implicit exceptions.
1777   //
1778   // constraint : ra/rb =/= scratch
1779   //         normal case
1780   //
1781   // input : ra: dividend
1782   //         rb: divisor
1783   //
1784   // result: either
1785   //         quotient  (= ra idiv rb)
1786   //         remainder (= ra irem rb)
1787 
1788   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1789 
1790   int idivl_offset = offset();
1791   if (! want_remainder) {
1792     sdivw(result, ra, rb);
1793   } else {
1794     sdivw(scratch, ra, rb);
1795     Assembler::msubw(result, scratch, rb, ra);
1796   }
1797 
1798   return idivl_offset;
1799 }
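
// Worked example (a sketch) of the remainder case above: for ra == -7 and
// rb == 2, sdivw truncates toward zero giving scratch == -3, and
// msubw(result, scratch, rb, ra) computes -7 - (-3 * 2) == -1, matching
// Java's definition of -7 % 2.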
1800 
1801 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1802                                     bool want_remainder, Register scratch)
1803 {
1804   // Full implementation of Java ldiv and lrem.  The function
1805   // returns the (pc) offset of the div instruction - may be needed
1806   // for implicit exceptions.
1807   //
1808   // constraint : ra/rb =/= scratch
1809   //         normal case
1810   //
1811   // input : ra: dividend
1812   //         rb: divisor
1813   //
1814   // result: either
1815   //         quotient  (= ra idiv rb)
1816   //         remainder (= ra irem rb)
1817 
1818   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1819 
1820   int idivq_offset = offset();
1821   if (! want_remainder) {
1822     sdiv(result, ra, rb);
1823   } else {
1824     sdiv(scratch, ra, rb);
1825     Assembler::msub(result, scratch, rb, ra);
1826   }
1827 
1828   return idivq_offset;
1829 }
1830 
1831 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1832   address prev = pc() - NativeMembar::instruction_size;
1833   address last = code()->last_insn();
1834   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1835     NativeMembar *bar = NativeMembar_at(prev);
1836     // We are merging two memory barrier instructions.  On AArch64 we
1837     // can do this simply by ORing them together.
1838     bar->set_kind(bar->get_kind() | order_constraint);
1839     BLOCK_COMMENT("merged membar");
1840   } else {
1841     code()->set_last_insn(pc());
1842     dmb(Assembler::barrier(order_constraint));
1843   }
1844 }
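
// For example (a sketch of membar above): membar(LoadLoad) immediately
// followed by membar(StoreStore) folds into a single DMB whose kind is
// the bitwise OR of the two constraints, instead of two back-to-back
// barrier instructions.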
1845 
1846 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1847   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1848     merge_ldst(rt, adr, size_in_bytes, is_store);
1849     code()->clear_last_insn();
1850     return true;
1851   } else {
1852     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1853     const unsigned mask = size_in_bytes - 1;
1854     if (adr.getMode() == Address::base_plus_offset &&
1855         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1856       code()->set_last_insn(pc());
1857     }
1858     return false;
1859   }
1860 }
1861 
1862 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1863   // We always try to merge two adjacent loads into one ldp.
1864   if (!try_merge_ldst(Rx, adr, 8, false)) {
1865     Assembler::ldr(Rx, adr);
1866   }
1867 }
1868 
1869 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1870   // We always try to merge two adjacent loads into one ldp.
1871   if (!try_merge_ldst(Rw, adr, 4, false)) {
1872     Assembler::ldrw(Rw, adr);
1873   }
1874 }
1875 
1876 void MacroAssembler::str(Register Rx, const Address &adr) {
1877   // We always try to merge two adjacent stores into one stp.
1878   if (!try_merge_ldst(Rx, adr, 8, true)) {
1879     Assembler::str(Rx, adr);
1880   }
1881 }
1882 
1883 void MacroAssembler::strw(Register Rw, const Address &adr) {
1884   // We always try to merge two adjacent stores into one stp.
1885   if (!try_merge_ldst(Rw, adr, 4, true)) {
1886     Assembler::strw(Rw, adr);
1887   }
1888 }
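
// For example (a sketch of the merging family above): two adjacent
// 8-byte loads such as
//   ldr(r10, Address(sp, 0));
//   ldr(r11, Address(sp, 8));
// are combined into a single ldp(r10, r11, Address(sp, 0)) whenever
// ldst_can_merge() accepts the pair; otherwise they are emitted as-is.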
1889 
1890 // MacroAssembler routines found actually to be needed
1891 
1892 void MacroAssembler::push(Register src)
1893 {
1894   str(src, Address(pre(esp, -1 * wordSize)));
1895 }
1896 
1897 void MacroAssembler::pop(Register dst)
1898 {
1899   ldr(dst, Address(post(esp, 1 * wordSize)));
1900 }
1901 
1902 // Note: load_unsigned_short used to be called load_unsigned_word.
1903 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1904   int off = offset();
1905   ldrh(dst, src);
1906   return off;
1907 }
1908 
1909 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1910   int off = offset();
1911   ldrb(dst, src);
1912   return off;
1913 }
1914 
1915 int MacroAssembler::load_signed_short(Register dst, Address src) {
1916   int off = offset();
1917   ldrsh(dst, src);
1918   return off;
1919 }
1920 
1921 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1922   int off = offset();
1923   ldrsb(dst, src);
1924   return off;
1925 }
1926 
1927 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1928   int off = offset();
1929   ldrshw(dst, src);
1930   return off;
1931 }
1932 
1933 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1934   int off = offset();
1935   ldrsbw(dst, src);
1936   return off;
1937 }
1938 
1939 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1940   switch (size_in_bytes) {
1941   case  8:  ldr(dst, src); break;
1942   case  4:  ldrw(dst, src); break;
1943   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1944   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1945   default:  ShouldNotReachHere();
1946   }
1947 }
1948 
1949 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1950   switch (size_in_bytes) {
1951   case  8:  str(src, dst); break;
1952   case  4:  strw(src, dst); break;
1953   case  2:  strh(src, dst); break;
1954   case  1:  strb(src, dst); break;
1955   default:  ShouldNotReachHere();
1956   }
1957 }
1958 
1959 void MacroAssembler::decrementw(Register reg, int value)
1960 {
1961   if (value < 0)  { incrementw(reg, -value);      return; }
1962   if (value == 0) {                               return; }
1963   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1964   /* else */ {
1965     guarantee(reg != rscratch2, "invalid dst for register decrement");
1966     movw(rscratch2, (unsigned)value);
1967     subw(reg, reg, rscratch2);
1968   }
1969 }
1970 
1971 void MacroAssembler::decrement(Register reg, int value)
1972 {
1973   if (value < 0)  { increment(reg, -value);      return; }
1974   if (value == 0) {                              return; }
1975   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1976   /* else */ {
1977     assert(reg != rscratch2, "invalid dst for register decrement");
1978     mov(rscratch2, (unsigned long)value);
1979     sub(reg, reg, rscratch2);
1980   }
1981 }
1982 
1983 void MacroAssembler::decrementw(Address dst, int value)
1984 {
1985   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1986   if (dst.getMode() == Address::literal) {
1987     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1988     lea(rscratch2, dst);
1989     dst = Address(rscratch2);
1990   }
1991   ldrw(rscratch1, dst);
1992   decrementw(rscratch1, value);
1993   strw(rscratch1, dst);
1994 }
1995 
1996 void MacroAssembler::decrement(Address dst, int value)
1997 {
1998   assert(!dst.uses(rscratch1), "invalid address for decrement");
1999   if (dst.getMode() == Address::literal) {
2000     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2001     lea(rscratch2, dst);
2002     dst = Address(rscratch2);
2003   }
2004   ldr(rscratch1, dst);
2005   decrement(rscratch1, value);
2006   str(rscratch1, dst);
2007 }
2008 
2009 void MacroAssembler::incrementw(Register reg, int value)
2010 {
2011   if (value < 0)  { decrementw(reg, -value);      return; }
2012   if (value == 0) {                               return; }
2013   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2014   /* else */ {
2015     assert(reg != rscratch2, "invalid dst for register increment");
2016     movw(rscratch2, (unsigned)value);
2017     addw(reg, reg, rscratch2);
2018   }
2019 }
2020 
2021 void MacroAssembler::increment(Register reg, int value)
2022 {
2023   if (value < 0)  { decrement(reg, -value);      return; }
2024   if (value == 0) {                              return; }
2025   if (value < (1 << 12)) { add(reg, reg, value); return; }
2026   /* else */ {
2027     assert(reg != rscratch2, "invalid dst for register increment");
2028     movw(rscratch2, (unsigned)value);
2029     add(reg, reg, rscratch2);
2030   }
2031 }
2032 
2033 void MacroAssembler::incrementw(Address dst, int value)
2034 {
2035   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2036   if (dst.getMode() == Address::literal) {
2037     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2038     lea(rscratch2, dst);
2039     dst = Address(rscratch2);
2040   }
2041   ldrw(rscratch1, dst);
2042   incrementw(rscratch1, value);
2043   strw(rscratch1, dst);
2044 }
2045 
2046 void MacroAssembler::increment(Address dst, int value)
2047 {
2048   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2049   if (dst.getMode() == Address::literal) {
2050     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2051     lea(rscratch2, dst);
2052     dst = Address(rscratch2);
2053   }
2054   ldr(rscratch1, dst);
2055   increment(rscratch1, value);
2056   str(rscratch1, dst);
2057 }
2058 
2059 
2060 void MacroAssembler::pusha() {
2061   push(0x7fffffff, sp);
2062 }
2063 
2064 void MacroAssembler::popa() {
2065   pop(0x7fffffff, sp);
2066 }
2067 
2068 // Push lots of registers in the bit set supplied.  Don't push sp.
2069 // Return the number of words pushed
2070 int MacroAssembler::push(unsigned int bitset, Register stack) {
2071   int words_pushed = 0;
2072 
2073   // Scan bitset to accumulate register pairs
2074   unsigned char regs[32];
2075   int count = 0;
2076   for (int reg = 0; reg <= 30; reg++) {
2077     if (1 & bitset)
2078       regs[count++] = reg;
2079     bitset >>= 1;
2080   }
2081   regs[count++] = zr->encoding_nocheck();
2082   count &= ~1;  // Only push an even number of regs
2083 
2084   if (count) {
2085     stp(as_Register(regs[0]), as_Register(regs[1]),
2086        Address(pre(stack, -count * wordSize)));
2087     words_pushed += 2;
2088   }
2089   for (int i = 2; i < count; i += 2) {
2090     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2091        Address(stack, i * wordSize));
2092     words_pushed += 2;
2093   }
2094 
2095   assert(words_pushed == count, "oops, pushed != count");
2096 
2097   return count;
2098 }
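
// Worked example (a sketch of push(bitset) above): pushing {r3, r4, r5}
// yields count == 3, which is padded with zr to an even 4, so the stores
// are
//   stp(r3, r4, Address(pre(stack, -4 * wordSize)));
//   stp(r5, zr, Address(stack, 2 * wordSize));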
2099 
2100 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2101   int words_pushed = 0;
2102 
2103   // Scan bitset to accumulate register pairs
2104   unsigned char regs[32];
2105   int count = 0;
2106   for (int reg = 0; reg <= 30; reg++) {
2107     if (1 & bitset)
2108       regs[count++] = reg;
2109     bitset >>= 1;
2110   }
2111   regs[count++] = zr->encoding_nocheck();
2112   count &= ~1;
2113 
2114   for (int i = 2; i < count; i += 2) {
2115     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2116        Address(stack, i * wordSize));
2117     words_pushed += 2;
2118   }
2119   if (count) {
2120     ldp(as_Register(regs[0]), as_Register(regs[1]),
2121        Address(post(stack, count * wordSize)));
2122     words_pushed += 2;
2123   }
2124 
2125   assert(words_pushed == count, "oops, pushed != count");
2126 
2127   return count;
2128 }
2129 
2130 // Push lots of registers in the bit set supplied.  Don't push sp.
2131 // Return the number of words pushed
2132 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2133   int words_pushed = 0;
2134 
2135   // Scan bitset to accumulate register pairs
2136   unsigned char regs[32];
2137   int count = 0;
2138   for (int reg = 0; reg <= 31; reg++) {
2139     if (1 & bitset)
2140       regs[count++] = reg;
2141     bitset >>= 1;
2142   }
2143 
2144   if (count == 0) {
2145     return 0;
2146   }
2147 
2148   if (count == 1) {
2149     strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2150     return 1;
2151   }
2152 
2153   bool odd = (count & 1) == 1;
2154   int push_slots = count + (odd ? 1 : 0);
2155 
2156   // Always pushing full 128 bit registers.
2157   stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2158   words_pushed += 2;
2159 
2160   for (int i = 2; i + 1 < count; i += 2) {
2161     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2162     words_pushed += 2;
2163   }
2164 
2165   if (odd) {
2166     strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2167     words_pushed++;
2168   }
2169 
2170   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2171   return count;
2172 }
2173 
2174 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2175   int words_pushed = 0;
2176 
2177   // Scan bitset to accumulate register pairs
2178   unsigned char regs[32];
2179   int count = 0;
2180   for (int reg = 0; reg <= 31; reg++) {
2181     if (1 & bitset)
2182       regs[count++] = reg;
2183     bitset >>= 1;
2184   }
2185 
2186   if (count == 0) {
2187     return 0;
2188   }
2189 
2190   if (count == 1) {
2191     ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2192     return 1;
2193   }
2194 
2195   bool odd = (count & 1) == 1;
2196   int push_slots = count + (odd ? 1 : 0);
2197 
2198   if (odd) {
2199     ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2200     words_pushed++;
2201   }
2202 
2203   for (int i = 2; i + 1 < count; i += 2) {
2204     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2205     words_pushed += 2;
2206   }
2207 
2208   ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2209   words_pushed += 2;
2210 
2211   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2212 
2213   return count;
2214 }
2215 
2216 #ifdef ASSERT
2217 void MacroAssembler::verify_heapbase(const char* msg) {
2218 #if 0
2219   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2220   assert (Universe::heap() != NULL, "java heap should be initialized");
2221   if (!UseCompressedOops || Universe::ptr_base() == NULL) {
2222     // rheapbase is allocated as general register
2223     return;
2224   }
2225   if (CheckCompressedOops) {
2226     Label ok;
2227     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2228     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2229     br(Assembler::EQ, ok);
2230     stop(msg);
2231     bind(ok);
2232     pop(1 << rscratch1->encoding(), sp);
2233   }
2234 #endif
2235 }
2236 #endif
2237 
2238 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2239   Label done, not_weak;
2240   cbz(value, done);           // Use NULL as-is.
2241 
2242   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2243   tbz(value, 0, not_weak);    // Test for jweak tag.
2244 
2245   // Resolve jweak.
2246   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2247                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2248   verify_oop(value);
2249   b(done);
2250 
2251   bind(not_weak);
2252   // Resolve (untagged) jobject.
2253   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2254   verify_oop(value);
2255   bind(done);
2256 }
2257 
2258 void MacroAssembler::stop(const char* msg) {
2259   BLOCK_COMMENT(msg);
2260   dcps1(0xdeae);
2261   emit_int64((uintptr_t)msg);
2262 }
2263 
2264 void MacroAssembler::unimplemented(const char* what) {
2265   const char* buf = NULL;
2266   {
2267     ResourceMark rm;
2268     stringStream ss;
2269     ss.print("unimplemented: %s", what);
2270     buf = code_string(ss.as_string());
2271   }
2272   stop(buf);
2273 }
2274 
2275 // If a constant does not fit in an immediate field, generate some
2276 // number of MOV instructions and then perform the operation.
2277 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2278                                            add_sub_imm_insn insn1,
2279                                            add_sub_reg_insn insn2) {
2280   assert(Rd != zr, "Rd = zr and not setting flags?");
2281   if (operand_valid_for_add_sub_immediate((int)imm)) {
2282     (this->*insn1)(Rd, Rn, imm);
2283   } else {
2284     if (uabs(imm) < (1 << 24)) {
2285        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2286        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2287     } else {
2288        assert_different_registers(Rd, Rn);
2289        mov(Rd, (uint64_t)imm);
2290        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2291     }
2292   }
2293 }
2294 
2295 // Separate version which sets the flags. Optimisations are more restricted
2296 // because we must set the flags correctly.
2297 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2298                                            add_sub_imm_insn insn1,
2299                                            add_sub_reg_insn insn2) {
2300   if (operand_valid_for_add_sub_immediate((int)imm)) {
2301     (this->*insn1)(Rd, Rn, imm);
2302   } else {
2303     assert_different_registers(Rd, Rn);
2304     assert(Rd != zr, "overflow in immediate operand");
2305     mov(Rd, (uint64_t)imm);
2306     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2307   }
2308 }
2309 
2310 
2311 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2312   if (increment.is_register()) {
2313     add(Rd, Rn, increment.as_register());
2314   } else {
2315     add(Rd, Rn, increment.as_constant());
2316   }
2317 }
2318 
2319 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2320   if (increment.is_register()) {
2321     addw(Rd, Rn, increment.as_register());
2322   } else {
2323     addw(Rd, Rn, increment.as_constant());
2324   }
2325 }
2326 
2327 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2328   if (decrement.is_register()) {
2329     sub(Rd, Rn, decrement.as_register());
2330   } else {
2331     sub(Rd, Rn, decrement.as_constant());
2332   }
2333 }
2334 
2335 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2336   if (decrement.is_register()) {
2337     subw(Rd, Rn, decrement.as_register());
2338   } else {
2339     subw(Rd, Rn, decrement.as_constant());
2340   }
2341 }
2342 
2343 void MacroAssembler::reinit_heapbase()
2344 {
2345   if (UseCompressedOops) {
2346     if (Universe::is_fully_initialized()) {
2347       mov(rheapbase, CompressedOops::ptrs_base());
2348     } else {
2349       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2350       ldr(rheapbase, Address(rheapbase));
2351     }
2352   }
2353 }
2354 
2355 // this simulates the behaviour of the x86 cmpxchg instruction using a
2356 // load linked/store conditional pair. we use the acquire/release
2357 // versions of these instructions so that we flush pending writes as
2358 // per Java semantics.
2359 
2360 // n.b the x86 version assumes the old value to be compared against is
2361 // in rax and updates rax with the value located in memory if the
2362 // cmpxchg fails. we supply a register for the old value explicitly
2363 
2364 // the aarch64 load linked/store conditional instructions do not
2365 // accept an offset. so, unlike x86, we must provide a plain register
2366 // to identify the memory word to be compared/exchanged rather than a
2367 // register+offset Address.
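
// Usage sketch (register and label choices are illustrative):
//   Label ok, fail;
//   cmpxchgptr(r10 /* expected */, r11 /* new value */, r12 /* word addr */,
//              rscratch2, ok, &fail);
// On success control reaches 'ok'; on failure 'fail' is taken with the
// current memory value left in the expected-value register (r10 here).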
2368 
2369 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2370                                 Label &succeed, Label *fail) {
2371   // oldv holds comparison value
2372   // newv holds value to write in exchange
2373   // addr identifies memory word to compare against/update
2374   if (UseLSE) {
2375     mov(tmp, oldv);
2376     casal(Assembler::xword, oldv, newv, addr);
2377     cmp(tmp, oldv);
2378     br(Assembler::EQ, succeed);
2379     membar(AnyAny);
2380   } else {
2381     Label retry_load, nope;
2382     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2383       prfm(Address(addr), PSTL1STRM);
2384     bind(retry_load);
2385     // flush and load exclusive from the memory location
2386     // and fail if it is not what we expect
2387     ldaxr(tmp, addr);
2388     cmp(tmp, oldv);
2389     br(Assembler::NE, nope);
2390     // if we store+flush with no intervening write tmp will be zero
2391     stlxr(tmp, newv, addr);
2392     cbzw(tmp, succeed);
2393     // retry so we only ever return after a load fails to compare
2394     // ensures we don't return a stale value after a failed write.
2395     b(retry_load);
2396     // if the memory word differs we return it in oldv and signal a fail
2397     bind(nope);
2398     membar(AnyAny);
2399     mov(oldv, tmp);
2400   }
2401   if (fail)
2402     b(*fail);
2403 }
2404 
2405 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2406                                         Label &succeed, Label *fail) {
2407   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2408   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2409 }
2410 
2411 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2412                                 Label &succeed, Label *fail) {
2413   // oldv holds comparison value
2414   // newv holds value to write in exchange
2415   // addr identifies memory word to compare against/update
2416   // tmp returns 0/1 for success/failure
2417   if (UseLSE) {
2418     mov(tmp, oldv);
2419     casal(Assembler::word, oldv, newv, addr);
2420     cmp(tmp, oldv);
2421     br(Assembler::EQ, succeed);
2422     membar(AnyAny);
2423   } else {
2424     Label retry_load, nope;
2425     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2426       prfm(Address(addr), PSTL1STRM);
2427     bind(retry_load);
2428     // flush and load exclusive from the memory location
2429     // and fail if it is not what we expect
2430     ldaxrw(tmp, addr);
2431     cmp(tmp, oldv);
2432     br(Assembler::NE, nope);
2433     // if we store+flush with no intervening write tmp will be zero
2434     stlxrw(tmp, newv, addr);
2435     cbzw(tmp, succeed);
2436     // retry so we only ever return after a load fails to compare
2437     // ensures we don't return a stale value after a failed write.
2438     b(retry_load);
2439     // if the memory word differs we return it in oldv and signal a fail
2440     bind(nope);
2441     membar(AnyAny);
2442     mov(oldv, tmp);
2443   }
2444   if (fail)
2445     b(*fail);
2446 }
2447 
2448 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2449 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2450 // pass a register for the result, otherwise pass noreg.
2451 
2452 // Clobbers rscratch1
2453 void MacroAssembler::cmpxchg(Register addr, Register expected,
2454                              Register new_val,
2455                              enum operand_size size,
2456                              bool acquire, bool release,
2457                              bool weak,
2458                              Register result) {
2459   if (result == noreg)  result = rscratch1;
2460   BLOCK_COMMENT("cmpxchg {");
2461   if (UseLSE) {
2462     mov(result, expected);
2463     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2464     compare_eq(result, expected, size);
2465   } else {
2466     Label retry_load, done;
2467     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2468       prfm(Address(addr), PSTL1STRM);
2469     bind(retry_load);
2470     load_exclusive(result, addr, size, acquire);
2471     compare_eq(result, expected, size);
2472     br(Assembler::NE, done);
2473     store_exclusive(rscratch1, new_val, addr, size, release);
2474     if (weak) {
2475       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2476     } else {
2477       cbnzw(rscratch1, retry_load);
2478     }
2479     bind(done);
2480   }
2481   BLOCK_COMMENT("} cmpxchg");
2482 }
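
// Usage sketch for cmpxchg above: a strong, sequentially consistent
// word-sized CAS whose outcome is tested via the EQ flag (the old value
// is not needed, so noreg lets the result default to rscratch1):
//   cmpxchg(addr, expected, new_val, Assembler::word,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, L_succeeded);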
2483 
2484 // A generic comparison. Only compares for equality, clobbers rscratch1.
2485 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2486   if (size == xword) {
2487     cmp(rm, rn);
2488   } else if (size == word) {
2489     cmpw(rm, rn);
2490   } else if (size == halfword) {
2491     eorw(rscratch1, rm, rn);
2492     ands(zr, rscratch1, 0xffff);
2493   } else if (size == byte) {
2494     eorw(rscratch1, rm, rn);
2495     ands(zr, rscratch1, 0xff);
2496   } else {
2497     ShouldNotReachHere();
2498   }
2499 }
2500 
2501 
2502 static bool different(Register a, RegisterOrConstant b, Register c) {
2503   if (b.is_constant())
2504     return a != c;
2505   else
2506     return a != b.as_register() && a != c && b.as_register() != c;
2507 }
2508 
2509 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2510 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2511   if (UseLSE) {                                                         \
2512     prev = prev->is_valid() ? prev : zr;                                \
2513     if (incr.is_register()) {                                           \
2514       AOP(sz, incr.as_register(), prev, addr);                          \
2515     } else {                                                            \
2516       mov(rscratch2, incr.as_constant());                               \
2517       AOP(sz, rscratch2, prev, addr);                                   \
2518     }                                                                   \
2519     return;                                                             \
2520   }                                                                     \
2521   Register result = rscratch2;                                          \
2522   if (prev->is_valid())                                                 \
2523     result = different(prev, incr, addr) ? prev : rscratch2;            \
2524                                                                         \
2525   Label retry_load;                                                     \
2526   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2527     prfm(Address(addr), PSTL1STRM);                                     \
2528   bind(retry_load);                                                     \
2529   LDXR(result, addr);                                                   \
2530   OP(rscratch1, result, incr);                                          \
2531   STXR(rscratch2, rscratch1, addr);                                     \
2532   cbnzw(rscratch2, retry_load);                                         \
2533   if (prev->is_valid() && prev != result) {                             \
2534     IOP(prev, rscratch1, incr);                                         \
2535   }                                                                     \
2536 }
2537 
2538 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2539 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2540 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2541 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2542 
2543 #undef ATOMIC_OP
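
// For example (a sketch): the expansion of ATOMIC_OP(add, ...) above is
// MacroAssembler::atomic_add(prev, incr, addr), which atomically adds
// 'incr' to the word at [addr] and, when 'prev' is a valid register,
// leaves the pre-add value in it (reconstructed via the inverse op IOP
// on the LL/SC path, or produced directly by ldadd on the LSE path).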
2544 
2545 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2546 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2547   if (UseLSE) {                                                         \
2548     prev = prev->is_valid() ? prev : zr;                                \
2549     AOP(sz, newv, prev, addr);                                          \
2550     return;                                                             \
2551   }                                                                     \
2552   Register result = rscratch2;                                          \
2553   if (prev->is_valid())                                                 \
2554     result = different(prev, newv, addr) ? prev : rscratch2;            \
2555                                                                         \
2556   Label retry_load;                                                     \
2557   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2558     prfm(Address(addr), PSTL1STRM);                                     \
2559   bind(retry_load);                                                     \
2560   LDXR(result, addr);                                                   \
2561   STXR(rscratch1, newv, addr);                                          \
2562   cbnzw(rscratch1, retry_load);                                         \
2563   if (prev->is_valid() && prev != result)                               \
2564     mov(prev, result);                                                  \
2565 }
2566 
2567 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2568 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2569 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2570 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2571 
2572 #undef ATOMIC_XCHG
2573 
2574 #ifndef PRODUCT
2575 extern "C" void findpc(intptr_t x);
2576 #endif
2577 
2578 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2579 {
2580   // In order to get locks to work, we need to fake an in_VM state
2581   if (ShowMessageBoxOnError) {
2582     JavaThread* thread = JavaThread::current();
2583     JavaThreadState saved_state = thread->thread_state();
2584     thread->set_thread_state(_thread_in_vm);
2585 #ifndef PRODUCT
2586     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2587       ttyLocker ttyl;
2588       BytecodeCounter::print();
2589     }
2590 #endif
2591     if (os::message_box(msg, "Execution stopped, print registers?")) {
2592       ttyLocker ttyl;
2593       tty->print_cr(" pc = 0x%016lx", pc);
2594 #ifndef PRODUCT
2595       tty->cr();
2596       findpc(pc);
2597       tty->cr();
2598 #endif
2599       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2600       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2601       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2602       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2603       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2604       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2605       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2606       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2607       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2608       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2609       tty->print_cr("r10 = 0x%016lx", regs[10]);
2610       tty->print_cr("r11 = 0x%016lx", regs[11]);
2611       tty->print_cr("r12 = 0x%016lx", regs[12]);
2612       tty->print_cr("r13 = 0x%016lx", regs[13]);
2613       tty->print_cr("r14 = 0x%016lx", regs[14]);
2614       tty->print_cr("r15 = 0x%016lx", regs[15]);
2615       tty->print_cr("r16 = 0x%016lx", regs[16]);
2616       tty->print_cr("r17 = 0x%016lx", regs[17]);
2617       tty->print_cr("r18 = 0x%016lx", regs[18]);
2618       tty->print_cr("r19 = 0x%016lx", regs[19]);
2619       tty->print_cr("r20 = 0x%016lx", regs[20]);
2620       tty->print_cr("r21 = 0x%016lx", regs[21]);
2621       tty->print_cr("r22 = 0x%016lx", regs[22]);
2622       tty->print_cr("r23 = 0x%016lx", regs[23]);
2623       tty->print_cr("r24 = 0x%016lx", regs[24]);
2624       tty->print_cr("r25 = 0x%016lx", regs[25]);
2625       tty->print_cr("r26 = 0x%016lx", regs[26]);
2626       tty->print_cr("r27 = 0x%016lx", regs[27]);
2627       tty->print_cr("r28 = 0x%016lx", regs[28]);
2628       tty->print_cr("r30 = 0x%016lx", regs[30]);
2629       tty->print_cr("r31 = 0x%016lx", regs[31]);
2630       BREAKPOINT;
2631     }
2632   }
2633   fatal("DEBUG MESSAGE: %s", msg);
2634 }
2635 
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i >= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);         // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}
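
// A host-side restatement of add2_with_carry (a sketch for illustration only,
// not VM code): dest_hi:dest_lo is a 128-bit accumulator and src1/src2 are
// 64-bit addends whose carries must ripple into the high word, exactly as the
// adds/adc pairs above arrange.
/*
#include <cstdint>

static inline void add2_with_carry_ref(uint64_t& hi, uint64_t& lo,
                                       uint64_t src1, uint64_t src2) {
  unsigned __int128 acc = ((unsigned __int128)hi << 64) | lo;
  acc += src1;                  // adds dest_lo, src1; adc dest_hi, zr
  acc += src2;                  // adds dest_lo, src2; adc final_dest_hi, zr
  hi = (uint64_t)(acc >> 64);
  lo = (uint64_t)acc;
}
*/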

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
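
// How the decomposition above plays out on plain integers (a sketch for
// illustration only; SpillAddr and spill_address_ref are hypothetical names,
// not VM code). An unaligned offset must fit the 9-bit signed immediate form,
// an aligned one the 12-bit unsigned scaled form; anything larger is folded
// into the base register first.
/*
#include <cassert>
#include <cstdint>

struct SpillAddr { int64_t base_adjust; int64_t offset; };

static SpillAddr spill_address_ref(int size, int64_t offset) {
  assert(offset >= 0);
  int64_t base_adjust = 0;
  if ((offset & (size - 1)) && offset >= (1 << 8)) {
    base_adjust += offset & ((1 << 12) - 1);          // low 12 bits move into the base
    offset &= ~(int64_t)((1 << 12) - 1);
  }
  if (offset >= ((int64_t)1 << 12) * size) {
    base_adjust += offset & (((1 << 12) - 1) << 12);  // bits 12..23 move too
    offset &= ~(int64_t)(((1 << 12) - 1) << 12);
  }
  return { base_adjust, offset };
}
*/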

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}
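
// Concrete instances of the check above (illustration only): for 8-byte
// accesses off sp with AvoidUnalignedAccesses set, offsets (16, 24) pass
// because min(16, 24) is 16-byte aligned, while (8, 16) fail because the
// resulting ldp/stp would straddle a 16-byte boundary. Without the flag,
// any pair of 8-byte-aligned offsets passes.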

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
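
// Net effect (illustration only): two adjacent same-size accesses such as
//   ldr x2, [sp, #16]
//   ldr x3, [sp, #24]
// are rewound (code_section()->set_end) and re-emitted as
//   ldp x2, x3, [sp, #16]
// with the lower offset selecting the first register of the pair.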

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}
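
// A host-side reference for the pseudocode above (a sketch for illustration
// only, assuming 64-bit little-endian limbs; the generated code additionally
// ror()s each limb to cope with the big-endian int ordering of BigInteger
// arrays).
/*
#include <cstdint>

static void multiply_64_x_64_loop_ref(const uint64_t* x, int xstart,
                                      const uint64_t* y, int ystart,
                                      uint64_t* z) {
  uint64_t carry = 0;
  for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
    unsigned __int128 product = (unsigned __int128)y[idx] * x[xstart] + carry;
    z[kdx] = (uint64_t)product;              // low 64 bits
    carry  = (uint64_t)(product >> 64);      // high 64 bits
  }
  z[xstart] = carry;
}
*/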

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}
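
// Reference for the unrolled loop above (a sketch for illustration only,
// again on 64-bit little-endian limbs): multiply-accumulate of y into z with
// the fixed 64-bit multiplier product_hi, returning the final carry.
/*
#include <cstdint>

static uint64_t multiply_128_x_128_loop_ref(const uint64_t* y, uint64_t* z,
                                            int ystart, int kdx,
                                            uint64_t product_hi) {
  uint64_t carry = 0;
  int idx;
  for (idx = ystart - 2; idx >= 0; idx -= 2) {
    unsigned __int128 t3 = (unsigned __int128)y[idx+1] * product_hi + z[kdx+idx+1] + carry;
    uint64_t carry2 = (uint64_t)(t3 >> 64);
    unsigned __int128 t4 = (unsigned __int128)y[idx] * product_hi + z[kdx+idx] + carry2;
    carry = (uint64_t)(t4 >> 64);
    z[kdx+idx+1] = (uint64_t)t3;
    z[kdx+idx]   = (uint64_t)t4;
  }
  idx += 2;
  if (idx > 0) {                 // odd tail, cf. the if (idx > 0) above
    unsigned __int128 t = (unsigned __int128)y[idx] * product_hi + z[kdx+idx] + carry;
    z[kdx+idx] = (uint64_t)t;
    carry = (uint64_t)(t >> 64);
  }
  return carry;
}
*/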

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use the condition twice => fewer branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}
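
// A direct C rendering of the pseudocode above (a sketch for illustration
// only; mul_add_ref is a hypothetical name, not VM code). Note the generated
// code reuses 'out' to hold the carry once the pointer has been folded into
// 'offset'.
/*
#include <cstdint>

static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
                            int offset, int len, uint32_t k) {
  uint64_t carry = 0;
  offset = offset - 1;                    // caller passes out.length - offset
  for (int j = len - 1; j >= 0; j--) {
    uint64_t product = (uint64_t)in[j] * k + out[offset] + carry;
    out[offset--] = (uint32_t)product;
    carry = product >> 32;
  }
  return (uint32_t)carry;
}
*/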

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}
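
// Host-side equivalent of the four instructions above (a sketch for
// illustration; crc_table is the standard 256-entry reflected CRC-32 lookup
// table that StubRoutines provides to the generated code).
/*
#include <cstdint>

static inline uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val,
                                             const uint32_t* crc_table) {
  return crc_table[(val ^ crc) & 0xff] ^ (crc >> 8);
}
*/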

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
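
// The same computation on the host (a sketch for illustration only): a
// slicing-by-4 step that folds 32 bits at a time through four tables, with
// 'upper' selecting the high half of a 64-bit load, as the uxtb/ubfx chain
// above does.
/*
#include <cstdint>

static inline uint32_t update_word_crc32_ref(uint32_t crc, uint64_t v, bool upper,
                                             const uint32_t* t0, const uint32_t* t1,
                                             const uint32_t* t2, const uint32_t* t3) {
  uint32_t w = (uint32_t)(upper ? (v >> 32) : v) ^ crc;
  return t3[w & 0xff] ^ t2[(w >> 8) & 0xff] ^ t1[(w >> 16) & 0xff] ^ t0[w >> 24];
}
*/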

void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
      return;
  }

    mvnw(crc, crc);

    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// ((WeakHandle)result).resolve();
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cbz(rresult, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}

void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (CompressedOops::base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);
}

void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0 || d != s) {
      lsl(d, s, CompressedOops::shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
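
// The arithmetic encode and decode implement, on plain integers (a sketch
// for illustration only; the generated encode uses subs/csel rather than an
// explicit null test, but computes the same mapping).
/*
#include <cstdint>

static uint32_t encode_heap_oop_ref(uint64_t oop, uint64_t base, int shift) {
  if (base == 0) return (uint32_t)(oop >> shift);
  return oop == 0 ? 0 : (uint32_t)((oop - base) >> shift);
}

static uint64_t decode_heap_oop_ref(uint32_t narrow, uint64_t base, int shift) {
  if (base == 0) return (uint64_t)narrow << shift;
  return narrow == 0 ? 0 : base + ((uint64_t)narrow << shift);
}
*/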

void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}

void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);

MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
  assert(UseCompressedClassPointers, "not using compressed class pointers");
  assert(Metaspace::initialized(), "metaspace not initialized yet");

  if (_klass_decode_mode != KlassDecodeNone) {
    return _klass_decode_mode;
  }

  assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
         || 0 == CompressedKlassPointers::shift(), "decode alg wrong");

  if (CompressedKlassPointers::base() == NULL) {
    return (_klass_decode_mode = KlassDecodeZero);
  }

  if (operand_valid_for_logical_immediate(
        /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
    const uint64_t range_mask =
      (1UL << log2_intptr(CompressedKlassPointers::range())) - 1;
    if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
      return (_klass_decode_mode = KlassDecodeXor);
    }
  }

  const uint64_t shifted_base =
    (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
  guarantee((shifted_base & 0xffff0000ffffffff) == 0,
            "compressed class base bad alignment");

  return (_klass_decode_mode = KlassDecodeMovk);
}

void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  switch (klass_decode_mode()) {
  case KlassDecodeZero:
    if (CompressedKlassPointers::shift() != 0) {
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    break;

  case KlassDecodeXor:
    if (CompressedKlassPointers::shift() != 0) {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    break;

  case KlassDecodeMovk:
    if (CompressedKlassPointers::shift() != 0) {
      ubfx(dst, src, LogKlassAlignmentInBytes, 32);
    } else {
      movw(dst, src);
    }
    break;

  case KlassDecodeNone:
    ShouldNotReachHere();
    break;
  }
}

void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  switch (klass_decode_mode()) {
  case KlassDecodeZero:
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    break;

  case KlassDecodeXor:
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    break;

  case KlassDecodeMovk: {
    const uint64_t shifted_base =
      (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();

    if (dst != src) movw(dst, src);
    movk(dst, shifted_base >> 32, 32);

    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, dst, LogKlassAlignmentInBytes);
    }

    break;
  }

  case KlassDecodeNone:
    ShouldNotReachHere();
    break;
  }
}

void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
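
// What the three decode strategies compute, on plain integers (a sketch for
// illustration only; the mode names follow klass_decode_mode() above). Zero:
// there is no base to add. Xor: the base's low bits are clear across the
// whole compressed range, so xor-ing the base is equivalent to adding it.
// Movk: the shifted base is non-zero only in bits 32..47 (guaranteed above),
// so a single movk #imm, lsl #32 materializes it.
/*
#include <cstdint>

enum RefMode { Zero, Xor, Movk };

static uint64_t decode_klass_ref(uint32_t nk, uint64_t base, int shift, RefMode mode) {
  switch (mode) {
  case Zero: return (uint64_t)nk << shift;
  case Xor:  return ((uint64_t)nk << shift) ^ base;
  case Movk: {
    uint64_t shifted_base = base >> shift;   // bits 32..47 only, by guarantee
    return (nk | shifted_base) << shift;     // movw; movk #imm, lsl #32; lsl
  }
  }
  return 0;
}
*/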

void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

access_load_at(BasicType type,DecoratorSet decorators,Register dst,Address src,Register tmp1,Register thread_tmp)4089 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4090                                     Register dst, Address src,
4091                                     Register tmp1, Register thread_tmp) {
4092   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4093   decorators = AccessInternal::decorator_fixup(decorators);
4094   bool as_raw = (decorators & AS_RAW) != 0;
4095   if (as_raw) {
4096     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4097   } else {
4098     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4099   }
4100 }
4101 
access_store_at(BasicType type,DecoratorSet decorators,Address dst,Register src,Register tmp1,Register thread_tmp)4102 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4103                                      Address dst, Register src,
4104                                      Register tmp1, Register thread_tmp) {
4105   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4106   decorators = AccessInternal::decorator_fixup(decorators);
4107   bool as_raw = (decorators & AS_RAW) != 0;
4108   if (as_raw) {
4109     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4110   } else {
4111     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4112   }
4113 }
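
// Note the dispatch trick in both accessors: the qualified call
// (bs->BarrierSetAssembler::load_at) binds non-virtually to the base-class
// implementation, emitting a plain load or store, while the unqualified
// virtual call lets the active GC's BarrierSetAssembler subclass wrap the
// access with whatever pre/post barriers it needs.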

void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions and nmethod entry barriers are not enabled,
// i.e. we are not going to patch this instruction while the code is
// being executed by another thread.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);

  // nmethod entry barriers necessitate using the constant pool. They
  // have to be ordered with respect to oop accesses.
  // Using immediate literals would necessitate ISBs.
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL || !immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else {
    mov(dst, Address((address)obj, rspec));
  }
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll)
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
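
// The adr/sub/br sequence above is a computed entry into the unrolled block,
// in the spirit of Duff's device: every str(zr, ...) is a fixed 4-byte
// instruction, so with r = cnt % unroll residual words the branch lands
// 4 * r bytes before "entry" and falls through exactly the last r stores.
// Roughly:
//
//   uintptr_t target = entry_pc - 4 * r;   // adr + sub
//   goto *target;                          // br rscratch2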

void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can serve as a debugging
    // crumb, so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
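
// A rough C sketch of the banging pattern, with P the page size and S the
// requested size (hypothetical names, illustration only):
//
//   char* p = (char*)sp;
//   for (long s = S; s > 0; s -= P) { p -= P; *(int*)p = (int)s; }   // frame
//   for (int i = 1; i <= extra_shadow_pages; i++) *(int*)(p - i * P) = 0;
//
// Touching one page at a time, in order, makes the yellow/red guard pages
// fault at a controlled point instead of somewhere past them inside a large
// new frame.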

// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
  ldr(dest, Address(rthread, Thread::polling_page_offset()));
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::fetch_and_read_polling_page(Register r, relocInfo::relocType rtype) {
  get_polling_page(r, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}
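
// adrp encodes a signed 21-bit page delta, so it reaches roughly +/-4GB
// (2^20 4KB pages either way); checking the delta against both ends of the
// code cache is what makes the emitted instruction safe to relocate anywhere
// inside it.  In the out-of-range fallback, the adrp target shares the
// destination's low 32 bits but borrows bits 32..47 from the current pc (so
// the adrp itself is always in range), and the movk then overwrites bits
// 32..47 with the destination's true value.  Illustrative example:
//
//   target      = 0x00007f1234567000
//   adrp_target = 0x0000xxxx34567000   // xxxx = pc bits 32..47
//   movk reg1, #0x7f12, lsl #32        // fix up bits 32..47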

void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize)) {
      sub(sp, sp, framesize - 2 * wordSize);
    } else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}
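
// The two size thresholds track A64 immediate encodings: stp/ldp take a
// 7-bit signed offset scaled by 8, so framesize - 16 must stay below 1 << 9,
// and sub/add take an unsigned 12-bit immediate, hence the 1 << 12 limit;
// anything larger goes through rscratch1.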

void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize)) {
      add(sp, sp, framesize - 2 * wordSize);
    } else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}


// This method checks whether the provided byte array contains a byte with
// the highest bit set.
address MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here.  All other cases are in
    // the stub.
    Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    assert_different_registers(ary1, len, result);

    cmpw(len, 0);
    br(LE, SET_RESULT);
    cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // size > 32 then go to stub

    int shift = 64 - exact_log2(os::vm_page_size());
    lsl(rscratch1, ary1, shift);
    mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // at the end of page then go to stub
    subs(len, len, wordSize);
    br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    address tpc1 = trampoline_call(has_neg);
    if (tpc1 == NULL) {
      DEBUG_ONLY(reset_labels3(STUB_LONG, SET_RESULT, DONE));
      postcond(pc() == badAddress);
      return NULL;
    }
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
    address tpc2 = trampoline_call(has_neg_long);
    if (tpc2 == NULL) {
      DEBUG_ONLY(reset_labels2(SET_RESULT, DONE));
      postcond(pc() == badAddress);
      return NULL;
    }
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
  postcond(pc() != badAddress);
  return pc();
}
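
// Why the page-end check above works: shift == 64 - log2(page_size), so
// "ary1 << shift" leaves just the in-page offset, parked in the register's
// top bits.  Adding (4 * wordSize) << shift then carries out (CS) exactly
// when offset + 32 reaches or crosses the page boundary (a conservative
// test), i.e. whenever the fast path's 8-byte reads within their 32-byte
// window could touch an unmapped next page.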

address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                      Register tmp4, Register tmp5, Register result,
                                      Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // (a1 & a2) == 0 iff at least one pointer is null (or, very rarely, two
    // non-null pointers happen to share no set bits), so testing the AND
    // saves a branch in the common case.
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked" (surprisingly) by the ldrw above, so
    // it is faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    address tpc = trampoline_call(stub);
    if (tpc == NULL) {
      DEBUG_ONLY(reset_labels5(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
      postcond(pc() == badAddress);
      return NULL;
    }
    b(DONE);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
  postcond(pc() != badAddress);
  return pc();
}
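
// The tail comparisons rely on a shift-out trick: tmp5 is preloaded with
// -(cnt1 << (3 + log_elem_size)), minus the number of significant bits left.
// lslv uses only the low six bits of the shift count, so shifting the xor of
// the final (possibly overlapping) words left by 64 - bits_left throws away
// the bytes beyond the arrays' ends and keeps only genuine differences.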

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
address MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      address tpc = trampoline_call(zero_blocks);
      if (tpc == NULL) {
        DEBUG_ONLY(reset_labels1(around));
        postcond(pc() == badAddress);
        return NULL;
      }
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
  postcond(pc() != badAddress);
  return pc();
}
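
// The tail after "around" is a binary expansion of the remaining count: with
// zero_words_block_size == 8 it emits, in order, a test of cnt bit 2
// guarding four zeroed words (two stp), bit 1 guarding two words (one stp),
// and bit 0 guarding a single str, so any residue in [0, 7] words is handled
// with at most three forward branches and no loop.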

// base:         Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:          Immediate count in HeapWords.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++) {
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    }
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}
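
// The entry jump into the stp table mirrors zero_memory()'s computed branch,
// with different scaling: each stp zeroes 16 bytes and occupies 4 bytes of
// code, so with tmp alignment bytes needed the branch lands 4 * (tmp / 16)
// = tmp >> 2 bytes before initial_table_end, executing exactly the stores
// that pad base up to the next ZVA boundary.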

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
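
// Same computed-entry technique once more, but here each stp writes a pair
// (16 bytes) while cnt counts single words: the residue rscratch1 = cnt & 14
// is a word count, each skipped stp covers two of those words and is 4 bytes
// of code, hence the halved scale in sub(rscratch2, rscratch2, rscratch1,
// LSL, 1).  The odd word, if any, is handled by the final tbz/str.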

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);

    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}
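
// How the SIMD classification works: for little-endian chars, uzp1 gathers
// the even-numbered bytes of its two sources (the low, Latin-1 halves) and
// uzp2 the odd-numbered, high bytes.  The high bytes are funnelled into
// general registers and OR-ed together; any set bit means some char exceeds
// 0xFF, so the code falls back to the narrower loops, which locate the
// exact index where encoding must stop.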

// Inflate byte[] array to char[].
address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           FloatRegister vtmp3, Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      address tpc = trampoline_call(stub);
      if (tpc == NULL) {
        DEBUG_ONLY(reset_labels2(big, done));
        postcond(pc() == badAddress);
        return NULL;
      }
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
  postcond(pc() != badAddress);
  return pc();
}
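
// zip1 with a zeroed second operand (vtmp1 is cleared up front) interleaves
// each data byte with 0x00, which on this little-endian target turns every
// byte into the equal-valued 16-bit char: an 8-way zero-extension per
// instruction.  Note that the tail re-reads the final 8 bytes at an
// overlapping offset instead of branching on the residual count.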

// Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}
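
// encode_iso_array leaves the number of chars actually copied in result and
// the number still unprocessed in len, whereas StringUTF16.compress wants 0
// on any failure.  The cmp/csel pair implements exactly
// "result = (len == 0) ? result : 0" in two branch-free instructions.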

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // would like to assert this
  // assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clear virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}

void MacroAssembler::cache_wbsync(bool is_pre) {
  // we only need a barrier post sync
  if (!is_pre) {
    membar(Assembler::AnyAny);
  }
}
5268