/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ull << 48), "48-bit overflow in address constant");
  intptr_t offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
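      // For example, a type 2 sequence materializing the address
      // 0x7f12345678 might look like:
      //   adrp x3, 0x7f12345000   ; x3 = target page base
      //   add  x3, x3, #0x678     ; x3 = full target address
      //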
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
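        // The movk supplies bits 47:32 of the target directly, so the
        // adrp only has to cover the low 32 bits: recompute the page
        // offset against a destination whose high bits are borrowed
        // from the branch's own address, keeping it in adrp range.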
        uintptr_t dest = ((uintptr_t)target & 0xffffffffULL) | ((uintptr_t)branch & 0xffff00000000ULL);
        uintptr_t pc_page = (uintptr_t)branch >> 12;
        uintptr_t adr_page = (uintptr_t)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
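  // For instance, a narrow OOP 0x12345678 is patched as
  //   movz Rd, #0x1234, lsl #16   (upper 16 bits)
  //   movk Rd, #0x5678            (lower 16 bits)
  // while a wide OOP is spread over movz+movk+movk, 16 bits at a time.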
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  intptr_t offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
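  // DSB SY is a full system data synchronization barrier, ordering all
  // prior memory accesses before any later ones; the thread and tmp
  // registers are unused in this implementation.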
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
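    // Thread-local poll: load the thread's polling word and branch to
    // the slow path if the poll bit is armed.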
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & esp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    uintptr_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB, which is within ADRP's reach.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    uintptr_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB, which is within ADRP's reach.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with PAC signature during throw_pending_exception
  // if it was tail-call optimized by compiler, since lr is not callee-saved
  // reload it with proper value
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        postcond(pc() == badAddress);
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  postcond(pc() != badAddress);
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

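// Concretely, the stub emitted below is:
//   ldr  rscratch1, <pc + 8>   ; load the 64-bit target that follows
//   br   rscratch1
//   <8-byte destination address>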
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // uintptr_t offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
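  // scan_temp now points at the first itableOffsetEntry:
  //   recv_klass + vtable_start_offset + vtable_length * wordSize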

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
  cmp(intf_klass, method_result);
  br(Assembler::EQ, found_method);
  bind(search);
  // Check that the previous entry is non-null.  A null entry means that
  // the receiver class doesn't implement the interface, and wasn't the
  // same as when the caller was compiled.
  cbz(method_result, L_no_such_interface);
  if (itableOffsetEntry::interface_offset_in_bytes() != 0) {
    add(scan_temp, scan_temp, scan_step);
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
  } else {
    ldr(method_result, Address(pre(scan_temp, scan_step)));
  }
  cmp(intf_klass, method_result);
  br(Assembler::NE, search);

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1217 
check_klass_subtype_slow_path(Register sub_klass,Register super_klass,Register temp_reg,Register temp2_reg,Label * L_success,Label * L_failure,bool set_cond_codes)1218 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1219                                                    Register super_klass,
1220                                                    Register temp_reg,
1221                                                    Register temp2_reg,
1222                                                    Label* L_success,
1223                                                    Label* L_failure,
1224                                                    bool set_cond_codes) {
1225   assert_different_registers(sub_klass, super_klass, temp_reg);
1226   if (temp2_reg != noreg)
1227     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1228 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1229 
1230   Label L_fallthrough;
1231   int label_nulls = 0;
1232   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1233   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1234   assert(label_nulls <= 1, "at most one NULL in the batch");
1235 
1236   // a couple of useful fields in sub_klass:
1237   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1238   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1239   Address secondary_supers_addr(sub_klass, ss_offset);
1240   Address super_cache_addr(     sub_klass, sc_offset);
1241 
1242   BLOCK_COMMENT("check_klass_subtype_slow_path");
1243 
1244   // Do a linear scan of the secondary super-klass chain.
1245   // This code is rarely used, so simplicity is a virtue here.
1246   // The repne_scan instruction uses fixed registers, which we must spill.
1247   // Don't worry too much about pre-existing connections with the input regs.
1248 
1249   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1250   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1251 
1252   RegSet pushed_registers;
1253   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1254   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1255 
1256   if (super_klass != r0 || UseCompressedOops) {
1257     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1258   }
1259 
1260   push(pushed_registers, sp);
1261 
1262   // Get super_klass value into r0 (even if it was in r5 or r2).
1263   if (super_klass != r0) {
1264     mov(r0, super_klass);
1265   }
1266 
1267 #ifndef PRODUCT
1268   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1269   Address pst_counter_addr(rscratch2);
1270   ldr(rscratch1, pst_counter_addr);
1271   add(rscratch1, rscratch1, 1);
1272   str(rscratch1, pst_counter_addr);
1273 #endif //PRODUCT
1274 
1275   // We will consult the secondary-super array.
1276   ldr(r5, secondary_supers_addr);
1277   // Load the array length.
1278   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1279   // Skip to start of data.
1280   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1281 
1282   cmp(sp, zr); // Clear Z flag; SP is never zero
1283   // Scan R2 words at [R5] for an occurrence of R0.
1284   // Set NZ/Z based on last compare.
1285   repne_scan(r5, r0, r2, rscratch1);
1286 
1287   // Unspill the temp. registers:
1288   pop(pushed_registers, sp);
1289 
1290   br(Assembler::NE, *L_failure);
1291 
1292   // Success.  Cache the super we found and proceed in triumph.
1293   str(super_klass, super_cache_addr);
1294 
1295   if (L_success != &L_fallthrough) {
1296     b(*L_success);
1297   }
1298 
1299 #undef IS_A_TEMP
1300 
1301   bind(L_fallthrough);
1302 }
1303 
1304 
verify_oop(Register reg,const char * s)1305 void MacroAssembler::verify_oop(Register reg, const char* s) {
1306   if (!VerifyOops) return;
1307 
1308   // Pass register number to verify_oop_subroutine
1309   const char* b = NULL;
1310   {
1311     ResourceMark rm;
1312     stringStream ss;
1313     ss.print("verify_oop: %s: %s", reg->name(), s);
1314     b = code_string(ss.as_string());
1315   }
1316   BLOCK_COMMENT("verify_oop {");
1317 
1318   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1319   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1320 
1321   mov(r0, reg);
1322   movptr(rscratch1, (uintptr_t)(address)b);
1323 
1324   // call indirectly to solve generation ordering problem
1325   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1326   ldr(rscratch2, Address(rscratch2));
1327   blr(rscratch2);
1328 
1329   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1330   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1331 
1332   BLOCK_COMMENT("} verify_oop");
1333 }
1334 
verify_oop_addr(Address addr,const char * s)1335 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1336   if (!VerifyOops) return;
1337 
1338   const char* b = NULL;
1339   {
1340     ResourceMark rm;
1341     stringStream ss;
1342     ss.print("verify_oop_addr: %s", s);
1343     b = code_string(ss.as_string());
1344   }
1345   BLOCK_COMMENT("verify_oop_addr {");
1346 
1347   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1348   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1349 
1350   // addr may contain sp so we will have to adjust it based on the
1351   // pushes that we just did.
1352   if (addr.uses(sp)) {
1353     lea(r0, addr);
1354     ldr(r0, Address(r0, 4 * wordSize));
1355   } else {
1356     ldr(r0, addr);
1357   }
1358   movptr(rscratch1, (uintptr_t)(address)b);
1359 
1360   // call indirectly to solve generation ordering problem
1361   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1362   ldr(rscratch2, Address(rscratch2));
1363   blr(rscratch2);
1364 
1365   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1366   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1367 
1368   BLOCK_COMMENT("} verify_oop_addr");
1369 }
1370 
argument_address(RegisterOrConstant arg_slot,int extra_slot_offset)1371 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1372                                          int extra_slot_offset) {
1373   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1374   int stackElementSize = Interpreter::stackElementSize;
1375   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1376 #ifdef ASSERT
1377   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1378   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1379 #endif
1380   if (arg_slot.is_constant()) {
1381     return Address(esp, arg_slot.as_constant() * stackElementSize
1382                    + offset);
1383   } else {
1384     add(rscratch1, esp, arg_slot.as_register(),
1385         ext::uxtx, exact_log2(stackElementSize));
1386     return Address(rscratch1, offset);
1387   }
1388 }
1389 
call_VM_leaf_base(address entry_point,int number_of_arguments,Label * retaddr)1390 void MacroAssembler::call_VM_leaf_base(address entry_point,
1391                                        int number_of_arguments,
1392                                        Label *retaddr) {
1393   Label E, L;
1394 
1395   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1396 
1397   mov(rscratch1, entry_point);
1398   blr(rscratch1);
1399   if (retaddr)
1400     bind(*retaddr);
1401 
1402   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1403   maybe_isb();
1404 }
1405 
call_VM_leaf(address entry_point,int number_of_arguments)1406 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1407   call_VM_leaf_base(entry_point, number_of_arguments);
1408 }
1409 
call_VM_leaf(address entry_point,Register arg_0)1410 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1411   pass_arg0(this, arg_0);
1412   call_VM_leaf_base(entry_point, 1);
1413 }
1414 
call_VM_leaf(address entry_point,Register arg_0,Register arg_1)1415 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1416   pass_arg0(this, arg_0);
1417   pass_arg1(this, arg_1);
1418   call_VM_leaf_base(entry_point, 2);
1419 }
1420 
call_VM_leaf(address entry_point,Register arg_0,Register arg_1,Register arg_2)1421 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1422                                   Register arg_1, Register arg_2) {
1423   pass_arg0(this, arg_0);
1424   pass_arg1(this, arg_1);
1425   pass_arg2(this, arg_2);
1426   call_VM_leaf_base(entry_point, 3);
1427 }
1428 
super_call_VM_leaf(address entry_point,Register arg_0)1429 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1430   pass_arg0(this, arg_0);
1431   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1432 }
1433 
super_call_VM_leaf(address entry_point,Register arg_0,Register arg_1)1434 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1435 
1436   assert(arg_0 != c_rarg1, "smashed arg");
1437   pass_arg1(this, arg_1);
1438   pass_arg0(this, arg_0);
1439   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1440 }
1441 
super_call_VM_leaf(address entry_point,Register arg_0,Register arg_1,Register arg_2)1442 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1443   assert(arg_0 != c_rarg2, "smashed arg");
1444   assert(arg_1 != c_rarg2, "smashed arg");
1445   pass_arg2(this, arg_2);
1446   assert(arg_0 != c_rarg1, "smashed arg");
1447   pass_arg1(this, arg_1);
1448   pass_arg0(this, arg_0);
1449   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1450 }
1451 
super_call_VM_leaf(address entry_point,Register arg_0,Register arg_1,Register arg_2,Register arg_3)1452 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1453   assert(arg_0 != c_rarg3, "smashed arg");
1454   assert(arg_1 != c_rarg3, "smashed arg");
1455   assert(arg_2 != c_rarg3, "smashed arg");
1456   pass_arg3(this, arg_3);
1457   assert(arg_0 != c_rarg2, "smashed arg");
1458   assert(arg_1 != c_rarg2, "smashed arg");
1459   pass_arg2(this, arg_2);
1460   assert(arg_0 != c_rarg1, "smashed arg");
1461   pass_arg1(this, arg_1);
1462   pass_arg0(this, arg_0);
1463   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1464 }
1465 
null_check(Register reg,int offset)1466 void MacroAssembler::null_check(Register reg, int offset) {
1467   if (needs_explicit_null_check(offset)) {
1468     // provoke OS NULL exception if reg = NULL by
1469     // accessing M[reg] w/o changing any registers
1470     // NOTE: this is plenty to provoke a segv
1471     ldr(zr, Address(reg));
1472   } else {
1473     // nothing to do, (later) access of M[reg + offset]
1474     // will provoke OS NULL exception if reg = NULL
1475   }
1476 }
1477 
1478 // MacroAssembler protected routines needed to implement
1479 // public methods
1480 
mov(Register r,Address dest)1481 void MacroAssembler::mov(Register r, Address dest) {
1482   code_section()->relocate(pc(), dest.rspec());
1483   uint64_t imm64 = (uint64_t)dest.target();
1484   movptr(r, imm64);
1485 }
1486 
1487 // Move a constant pointer into r.  In AArch64 mode the virtual
1488 // address space is 48 bits in size, so we only need three
1489 // instructions to create a patchable instruction sequence that can
1490 // reach anywhere.
movptr(Register r,uintptr_t imm64)1491 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1492 #ifndef PRODUCT
1493   {
1494     char buffer[64];
1495     snprintf(buffer, sizeof(buffer), INTPTR_FORMAT, imm64);
1496     block_comment(buffer);
1497   }
1498 #endif
1499   assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1500   movz(r, imm64 & 0xffff);
1501   imm64 >>= 16;
1502   movk(r, imm64 & 0xffff, 16);
1503   imm64 >>= 16;
1504   movk(r, imm64 & 0xffff, 32);
1505 }

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  uint32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  uint32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
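
// For illustration: with T4S, mov(v0, T4S, 0x00ff0000) has only one
// non-zero byte and would emit a single movi(v0, T4S, 0xff, 16), while
// mov(v0, T4S, 0xfffff0ff) is cheaper in complemented form (~imm32 ==
// 0x00000f00) and would emit a single mvni(v0, T4S, 0x0f, 8).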

void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    uint64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (uint32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (uint32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (uint32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (uint32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (uint32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (uint32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (uint32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (uint32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (uint32_t)imm_h[i], (i << 4));
      }
    }
  }
}
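
// For illustration: imm64 == 0xffff00001234ffff splits into halfwords
// { 0xffff, 0x1234, 0x0000, 0xffff } (zero_count == 1, neg_count == 2),
// so the neg_count == 2 branch above would emit
//   movn dst, #0xedcb, lsl #16   // ~(0xedcb << 16) == 0xffffffff1234ffff
//   movk dst, #0x0000, lsl #32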

void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    uint32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}
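
// For illustration: mov_immediate32(r0, 0x12340000) takes the
// imm_h[0] == 0 branch and would emit a single movzw(r0, 0x1234, 16);
// 0x1234ffff takes the imm_h[0] == 0xffff branch and would emit
// movnw(r0, 0xedcb, 16).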

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    uint64_t word_offset = byte_offset >> shift;
    uint64_t masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
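
// For illustration: with shift == 3, form_address(r9, r10, 0x40008, 3)
// does not fit a scaled 12-bit immediate, so the two-part path above
// would emit add(r9, r10, 0x40000) and return Address(r9, 8).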

void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (!want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}
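
// Note that AArch64 sdivw itself never traps: division by zero yields
// zero and min_jint / -1 yields min_jint, which already matches Java
// semantics, so no quotient fix-up is needed here; any required
// divide-by-zero check is expected to be emitted by the caller.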

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (!want_remainder) {
    sdiv(result, ra, rb);
  } else {
    sdiv(scratch, ra, rb);
    Assembler::msub(result, scratch, rb, ra);
  }

  return idivq_offset;
}

void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
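
// For illustration: emitting
//   membar(Assembler::LoadLoad);
//   membar(Assembler::LoadStore);
// back to back would produce a single dmb whose kind is the OR of the
// two constraints, because the second call sees the first barrier as
// the last instruction emitted and widens it in place.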

bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}

void MacroAssembler::ldr(Register Rx, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rx, adr, 8, false)) {
    Assembler::ldr(Rx, adr);
  }
}

void MacroAssembler::ldrw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent loads into one ldp.
  if (!try_merge_ldst(Rw, adr, 4, false)) {
    Assembler::ldrw(Rw, adr);
  }
}

void MacroAssembler::str(Register Rx, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rx, adr, 8, true)) {
    Assembler::str(Rx, adr);
  }
}

void MacroAssembler::strw(Register Rw, const Address &adr) {
  // We always try to merge two adjacent stores into one stp.
  if (!try_merge_ldst(Rw, adr, 4, true)) {
    Assembler::strw(Rw, adr);
  }
}

// MacroAssembler routines actually found to be needed

void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  int off = offset();
  ldrshw(dst, src);
  return off;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  int off = offset();
  ldrsbw(dst, src);
  return off;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  case  8:  ldr(dst, src); break;
  case  4:  ldrw(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  case  8:  str(src, dst); break;
  case  4:  strw(src, dst); break;
  case  2:  strh(src, dst); break;
  case  1:  strb(src, dst); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::decrementw(Register reg, int value)
{
  if (value < 0)  { incrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { subw(reg, reg, value); return; }
  /* else */ {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrement(Register reg, int value)
{
  if (value < 0)  { increment(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { sub(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (uint64_t)value);
    sub(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

void MacroAssembler::incrementw(Register reg, int value)
{
  if (value < 0)  { decrementw(reg, -value);      return; }
  if (value == 0) {                               return; }
  if (value < (1 << 12)) { addw(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

void MacroAssembler::increment(Register reg, int value)
{
  if (value < 0)  { decrement(reg, -value);      return; }
  if (value == 0) {                              return; }
  if (value < (1 << 12)) { add(reg, reg, value); return; }
  /* else */ {
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}

void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}


void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}

void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}

// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
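
// For illustration: a bitset naming r1, r3 and r7 collects
// regs == { 1, 3, 7 }, is padded with zr to an even count, and with
// stack == sp would emit
//   stp x1, x3, [sp, #-32]!
//   stp x7, xzr, [sp, #16]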

int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif

void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
  Label done, not_weak;
  cbz(value, done);           // Use NULL as-is.

  STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.

  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
                 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
  verify_oop(value);
  b(done);

  bind(not_weak);
  // Resolve (untagged) jobject.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  bind(done);
}

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  movptr(c_rarg0, (uintptr_t)(address)msg);
  movptr(c_rarg1, (uintptr_t)(address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  blr(c_rarg3);
  hlt(0);
}

void MacroAssembler::warn(const char* msg) {
  pusha();
  mov(c_rarg0, (address)msg);
  mov(lr, CAST_FROM_FN_PTR(address, warning));
  blr(lr);
  popa();
}

void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
       (this->*insn1)(Rd, Rn, imm & -(1 << 12));
       (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
       assert_different_registers(Rd, Rn);
       mov(Rd, (uint64_t)imm);
       (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}

// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}


void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    add(Rd, Rn, increment.as_register());
  } else {
    add(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (increment.is_register()) {
    addw(Rd, Rn, increment.as_register());
  } else {
    addw(Rd, Rn, increment.as_constant());
  }
}

void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    sub(Rd, Rn, decrement.as_register());
  } else {
    sub(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
  if (decrement.is_register()) {
    subw(Rd, Rn, decrement.as_register());
  } else {
    subw(Rd, Rn, decrement.as_constant());
  }
}

void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b. the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}

// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}
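
// For illustration, a typical strong full-barrier CAS of a 64-bit word
// that discards the old value and then branches on the flags set above
// might look like
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, succeeded);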
2387 
2388 // A generic comparison. Only compares for equality, clobbers rscratch1.
compare_eq(Register rm,Register rn,enum operand_size size)2389 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2390   if (size == xword) {
2391     cmp(rm, rn);
2392   } else if (size == word) {
2393     cmpw(rm, rn);
2394   } else if (size == halfword) {
2395     eorw(rscratch1, rm, rn);
2396     ands(zr, rscratch1, 0xffff);
2397   } else if (size == byte) {
2398     eorw(rscratch1, rm, rn);
2399     ands(zr, rscratch1, 0xff);
2400   } else {
2401     ShouldNotReachHere();
2402   }
2403 }
2404 
2405 
different(Register a,RegisterOrConstant b,Register c)2406 static bool different(Register a, RegisterOrConstant b, Register c) {
2407   if (b.is_constant())
2408     return a != c;
2409   else
2410     return a != b.as_register() && a != c && b.as_register() != c;
2411 }
2412 
2413 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2414 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2415   if (UseLSE) {                                                         \
2416     prev = prev->is_valid() ? prev : zr;                                \
2417     if (incr.is_register()) {                                           \
2418       AOP(sz, incr.as_register(), prev, addr);                          \
2419     } else {                                                            \
2420       mov(rscratch2, incr.as_constant());                               \
2421       AOP(sz, rscratch2, prev, addr);                                   \
2422     }                                                                   \
2423     return;                                                             \
2424   }                                                                     \
2425   Register result = rscratch2;                                          \
2426   if (prev->is_valid())                                                 \
2427     result = different(prev, incr, addr) ? prev : rscratch2;            \
2428                                                                         \
2429   Label retry_load;                                                     \
2430   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2431     prfm(Address(addr), PSTL1STRM);                                     \
2432   bind(retry_load);                                                     \
2433   LDXR(result, addr);                                                   \
2434   OP(rscratch1, result, incr);                                          \
2435   STXR(rscratch2, rscratch1, addr);                                     \
2436   cbnzw(rscratch2, retry_load);                                         \
2437   if (prev->is_valid() && prev != result) {                             \
2438     IOP(prev, rscratch1, incr);                                         \
2439   }                                                                     \
2440 }
2441 
2442 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2443 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2444 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2445 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2446 
2447 #undef ATOMIC_OP
2448 
2449 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2450 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2451   if (UseLSE) {                                                         \
2452     prev = prev->is_valid() ? prev : zr;                                \
2453     AOP(sz, newv, prev, addr);                                          \
2454     return;                                                             \
2455   }                                                                     \
2456   Register result = rscratch2;                                          \
2457   if (prev->is_valid())                                                 \
2458     result = different(prev, newv, addr) ? prev : rscratch2;            \
2459                                                                         \
2460   Label retry_load;                                                     \
2461   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2462     prfm(Address(addr), PSTL1STRM);                                     \
2463   bind(retry_load);                                                     \
2464   LDXR(result, addr);                                                   \
2465   STXR(rscratch1, newv, addr);                                          \
2466   cbnzw(rscratch1, retry_load);                                         \
2467   if (prev->is_valid() && prev != result)                               \
2468     mov(prev, result);                                                  \
2469 }
2470 
2471 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2472 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2473 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2474 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2475 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2477 
2478 #undef ATOMIC_XCHG
2479 
2480 #ifndef PRODUCT
2481 extern "C" void findpc(intptr_t x);
2482 #endif
2483 
debug64(char * msg,int64_t pc,int64_t regs[])2484 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2485 {
2486   // In order to get locks to work, we need to fake a in_VM state
2487   if (ShowMessageBoxOnError ) {
2488     JavaThread* thread = JavaThread::current();
2489     JavaThreadState saved_state = thread->thread_state();
2490     thread->set_thread_state(_thread_in_vm);
2491 #ifndef PRODUCT
2492     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2493       ttyLocker ttyl;
2494       BytecodeCounter::print();
2495     }
2496 #endif
2497     if (os::message_box(msg, "Execution stopped, print registers?")) {
2498       ttyLocker ttyl;
2499       tty->print_cr(" pc = 0x" UINT64_FORMAT_X, pc);
2500 #ifndef PRODUCT
2501       tty->cr();
2502       findpc(pc);
2503       tty->cr();
2504 #endif
2505       tty->print_cr(" r0 = 0x" UINT64_FORMAT_X, regs[0]);
2506       tty->print_cr(" r1 = 0x" UINT64_FORMAT_X, regs[1]);
2507       tty->print_cr(" r2 = 0x" UINT64_FORMAT_X, regs[2]);
2508       tty->print_cr(" r3 = 0x" UINT64_FORMAT_X, regs[3]);
2509       tty->print_cr(" r4 = 0x" UINT64_FORMAT_X, regs[4]);
2510       tty->print_cr(" r5 = 0x" UINT64_FORMAT_X, regs[5]);
2511       tty->print_cr(" r6 = 0x" UINT64_FORMAT_X, regs[6]);
2512       tty->print_cr(" r7 = 0x" UINT64_FORMAT_X, regs[7]);
2513       tty->print_cr(" r8 = 0x" UINT64_FORMAT_X, regs[8]);
2514       tty->print_cr(" r9 = 0x" UINT64_FORMAT_X, regs[9]);
2515       tty->print_cr("r10 = 0x" UINT64_FORMAT_X, regs[10]);
2516       tty->print_cr("r11 = 0x" UINT64_FORMAT_X, regs[11]);
2517       tty->print_cr("r12 = 0x" UINT64_FORMAT_X, regs[12]);
2518       tty->print_cr("r13 = 0x" UINT64_FORMAT_X, regs[13]);
2519       tty->print_cr("r14 = 0x" UINT64_FORMAT_X, regs[14]);
2520       tty->print_cr("r15 = 0x" UINT64_FORMAT_X, regs[15]);
2521       tty->print_cr("r16 = 0x" UINT64_FORMAT_X, regs[16]);
2522       tty->print_cr("r17 = 0x" UINT64_FORMAT_X, regs[17]);
2523       tty->print_cr("r18 = 0x" UINT64_FORMAT_X, regs[18]);
2524       tty->print_cr("r19 = 0x" UINT64_FORMAT_X, regs[19]);
2525       tty->print_cr("r20 = 0x" UINT64_FORMAT_X, regs[20]);
2526       tty->print_cr("r21 = 0x" UINT64_FORMAT_X, regs[21]);
2527       tty->print_cr("r22 = 0x" UINT64_FORMAT_X, regs[22]);
2528       tty->print_cr("r23 = 0x" UINT64_FORMAT_X, regs[23]);
2529       tty->print_cr("r24 = 0x" UINT64_FORMAT_X, regs[24]);
2530       tty->print_cr("r25 = 0x" UINT64_FORMAT_X, regs[25]);
2531       tty->print_cr("r26 = 0x" UINT64_FORMAT_X, regs[26]);
2532       tty->print_cr("r27 = 0x" UINT64_FORMAT_X, regs[27]);
2533       tty->print_cr("r28 = 0x" UINT64_FORMAT_X, regs[28]);
2534       tty->print_cr("r30 = 0x" UINT64_FORMAT_X, regs[30]);
2535       tty->print_cr("r31 = 0x" UINT64_FORMAT_X, regs[31]);
2536       BREAKPOINT;
2537     }
2538     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2539   } else {
2540     ttyLocker ttyl;
2541     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2542                     msg);
2543     assert(false, "DEBUG MESSAGE: %s", msg);
2544   }
2545 }
2546 
push_call_clobbered_registers()2547 void MacroAssembler::push_call_clobbered_registers() {
2548   int step = 4 * wordSize;
2549   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2550   sub(sp, sp, step);
2551   mov(rscratch1, -step);
2552   // Push v0-v7, v16-v31.
2553   for (int i = 31; i>= 4; i -= 4) {
2554     if (i <= v7->encoding() || i >= v16->encoding())
2555       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2556           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2557   }
2558   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2559       as_FloatRegister(3), T1D, Address(sp));
2560 }
2561 
pop_call_clobbered_registers()2562 void MacroAssembler::pop_call_clobbered_registers() {
2563   for (int i = 0; i < 32; i += 4) {
2564     if (i <= v7->encoding() || i >= v16->encoding())
2565       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2566           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2567   }
2568 
2569   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2570 }
2571 
push_CPU_state(bool save_vectors)2572 void MacroAssembler::push_CPU_state(bool save_vectors) {
2573   int step = (save_vectors ? 8 : 4) * wordSize;
2574   push(0x3fffffff, sp);         // integer registers except lr & sp
2575   mov(rscratch1, -step);
2576   sub(sp, sp, step);
2577   for (int i = 28; i >= 4; i -= 4) {
2578     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2579         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2580   }
2581   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2582 }
2583 
2584 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2585   int step = (restore_vectors ? 8 : 4) * wordSize;
2586   for (int i = 0; i <= 28; i += 4)
2587     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2588         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2589   pop(0x3fffffff, sp);         // integer registers except lr & sp
2590 }
2591 
2592 /**
2593  * Helpers for multiply_to_len().
2594  */
2595 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2596                                      Register src1, Register src2) {
2597   adds(dest_lo, dest_lo, src1);
2598   adc(dest_hi, dest_hi, zr);
2599   adds(dest_lo, dest_lo, src2);
2600   adc(final_dest_hi, dest_hi, zr);
2601 }
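
// A standalone sketch (illustrative only; this hypothetical helper is not
// part of the build) of what the adds/adc sequence above computes: both
// sources are added into the low word, and each carry-out is folded into the
// high word, which ends up in final_dest_hi.
static inline void add2_with_carry_sketch(uint64_t& final_dest_hi, uint64_t dest_hi,
                                          uint64_t& dest_lo, uint64_t src1, uint64_t src2) {
  uint64_t lo = dest_lo + src1;
  uint64_t hi = dest_hi + (lo < src1 ? 1 : 0);    // adc dest_hi, dest_hi, zr
  dest_lo = lo + src2;
  final_dest_hi = hi + (dest_lo < src2 ? 1 : 0);  // adc final_dest_hi, dest_hi, zr
}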
2602 
2603 // Generate an address from (r + r1 extend offset).  "size" is the
2604 // size of the operand.  The result may be in rscratch2.
2605 Address MacroAssembler::offsetted_address(Register r, Register r1,
2606                                           Address::extend ext, int offset, int size) {
2607   if (offset || (ext.shift() % size != 0)) {
2608     lea(rscratch2, Address(r, r1, ext));
2609     return Address(rscratch2, offset);
2610   } else {
2611     return Address(r, r1, ext);
2612   }
2613 }
2614 
2615 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2616 {
2617   assert(offset >= 0, "spill to negative address?");
2618   // Offset reachable ?
2619   //   Not aligned - 9 bits signed offset
2620   //   Aligned - 12 bits unsigned offset shifted
2621   Register base = sp;
2622   if ((offset & (size-1)) && offset >= (1<<8)) {
2623     add(tmp, base, offset & ((1<<12)-1));
2624     base = tmp;
2625     offset &= -1u<<12;
2626   }
2627 
2628   if (offset >= (1<<12) * size) {
2629     add(tmp, base, offset & (((1<<12)-1)<<12));
2630     base = tmp;
2631     offset &= ~(((1<<12)-1)<<12);
2632   }
2633 
2634   return Address(base, offset);
2635 }
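
// A standalone sketch (illustrative only; hypothetical helper, not used by
// the build) of the offset splitting above: the amount peeled into the scratch
// base register is returned via peeled, and the return value is the offset
// that remains encodable as a load/store immediate.
static inline int split_spill_offset_sketch(int size, int offset, int& peeled) {
  peeled = 0;
  if ((offset & (size - 1)) && offset >= (1 << 8)) {
    peeled += offset & ((1 << 12) - 1);          // low 12 bits go into the base
    offset &= -1u << 12;
  }
  if (offset >= (1 << 12) * size) {
    peeled += offset & (((1 << 12) - 1) << 12);  // next 12 bits go into the base
    offset &= ~(((1 << 12) - 1) << 12);
  }
  return offset;
}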
2636 
2637 // Checks whether offset is aligned.
2638 // Returns true if it is, else false.
2639 bool MacroAssembler::merge_alignment_check(Register base,
2640                                            size_t size,
2641                                            int64_t cur_offset,
2642                                            int64_t prev_offset) const {
2643   if (AvoidUnalignedAccesses) {
2644     if (base == sp) {
2645       // Checks whether the low offset is aligned to a pair of registers.
2646       int64_t pair_mask = size * 2 - 1;
2647       int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2648       return (offset & pair_mask) == 0;
2649     } else { // If base is not sp, we can't guarantee the access is aligned.
2650       return false;
2651     }
2652   } else {
2653     int64_t mask = size - 1;
2654     // Load/store pair instructions only support element-size-aligned offsets.
2655     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2656   }
2657 }
2658 
2659 // Checks whether current and previous loads/stores can be merged.
2660 // Returns true if it can be merged, else false.
2661 bool MacroAssembler::ldst_can_merge(Register rt,
2662                                     const Address &adr,
2663                                     size_t cur_size_in_bytes,
2664                                     bool is_store) const {
2665   address prev = pc() - NativeInstruction::instruction_size;
2666   address last = code()->last_insn();
2667 
2668   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2669     return false;
2670   }
2671 
2672   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2673     return false;
2674   }
2675 
2676   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2677   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2678 
2679   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2680   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2681 
2682   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2683     return false;
2684   }
2685 
2686   int64_t max_offset = 63 * prev_size_in_bytes;
2687   int64_t min_offset = -64 * prev_size_in_bytes;
2688 
2689   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2690 
2691   // Only same base can be merged.
2692   if (adr.base() != prev_ldst->base()) {
2693     return false;
2694   }
2695 
2696   int64_t cur_offset = adr.offset();
2697   int64_t prev_offset = prev_ldst->offset();
2698   size_t diff = abs(cur_offset - prev_offset);
2699   if (diff != prev_size_in_bytes) {
2700     return false;
2701   }
2702 
2703   // The following cases cannot be merged:
2704   // ldr x2, [x2, #8]
2705   // ldr x3, [x2, #16]
2706   // or:
2707   // ldr x2, [x3, #8]
2708   // ldr x2, [x3, #16]
2709   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2710   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2711     return false;
2712   }
2713 
2714   int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2715   // Offset range must be in ldp/stp instruction's range.
2716   if (low_offset > max_offset || low_offset < min_offset) {
2717     return false;
2718   }
2719 
2720   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2721     return true;
2722   }
2723 
2724   return false;
2725 }
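
// A simplified standalone sketch (illustrative only; hypothetical helper) of
// the offset conditions checked above: the two accesses must touch adjacent
// words, and the lower offset must fit the scaled 7-bit signed immediate of
// ldp/stp.
static inline bool offsets_can_merge_sketch(int64_t cur_offset, int64_t prev_offset,
                                            int64_t size_in_bytes) {
  int64_t max_offset = 63 * size_in_bytes;   // imm7 upper bound, scaled
  int64_t min_offset = -64 * size_in_bytes;  // imm7 lower bound, scaled
  int64_t diff = cur_offset - prev_offset;
  if (diff != size_in_bytes && diff != -size_in_bytes) {
    return false;                            // not adjacent
  }
  int64_t low = prev_offset < cur_offset ? prev_offset : cur_offset;
  return low >= min_offset && low <= max_offset;
}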
2726 
2727 // Merge current load/store with previous load/store into ldp/stp.
2728 void MacroAssembler::merge_ldst(Register rt,
2729                                 const Address &adr,
2730                                 size_t cur_size_in_bytes,
2731                                 bool is_store) {
2732 
2733   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2734 
2735   Register rt_low, rt_high;
2736   address prev = pc() - NativeInstruction::instruction_size;
2737   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2738 
2739   int64_t offset;
2740 
2741   if (adr.offset() < prev_ldst->offset()) {
2742     offset = adr.offset();
2743     rt_low = rt;
2744     rt_high = prev_ldst->target();
2745   } else {
2746     offset = prev_ldst->offset();
2747     rt_low = prev_ldst->target();
2748     rt_high = rt;
2749   }
2750 
2751   Address adr_p = Address(prev_ldst->base(), offset);
2752   // Overwrite the previously generated binary.
2753   code_section()->set_end(prev);
2754 
2755   const int sz = prev_ldst->size_in_bytes();
2756   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2757   if (!is_store) {
2758     BLOCK_COMMENT("merged ldr pair");
2759     if (sz == 8) {
2760       ldp(rt_low, rt_high, adr_p);
2761     } else {
2762       ldpw(rt_low, rt_high, adr_p);
2763     }
2764   } else {
2765     BLOCK_COMMENT("merged str pair");
2766     if (sz == 8) {
2767       stp(rt_low, rt_high, adr_p);
2768     } else {
2769       stpw(rt_low, rt_high, adr_p);
2770     }
2771   }
2772 }
2773 
2774 /**
2775  * Multiply 64 bit by 64 bit first loop.
2776  */
2777 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2778                                            Register y, Register y_idx, Register z,
2779                                            Register carry, Register product,
2780                                            Register idx, Register kdx) {
2781   //
2782   //  jlong carry, x[], y[], z[];
2783   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2784   //    huge_128 product = y[idx] * x[xstart] + carry;
2785   //    z[kdx] = (jlong)product;
2786   //    carry  = (jlong)(product >>> 64);
2787   //  }
2788   //  z[xstart] = carry;
2789   //
2790 
2791   Label L_first_loop, L_first_loop_exit;
2792   Label L_one_x, L_one_y, L_multiply;
2793 
2794   subsw(xstart, xstart, 1);
2795   br(Assembler::MI, L_one_x);
2796 
2797   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2798   ldr(x_xstart, Address(rscratch1));
2799   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2800 
2801   bind(L_first_loop);
2802   subsw(idx, idx, 1);
2803   br(Assembler::MI, L_first_loop_exit);
2804   subsw(idx, idx, 1);
2805   br(Assembler::MI, L_one_y);
2806   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2807   ldr(y_idx, Address(rscratch1));
2808   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2809   bind(L_multiply);
2810 
2811   // AArch64 has a multiply-accumulate instruction that we can't use
2812   // here because it has no way to process carries, so we have to use
2813   // separate add and adc instructions.  Bah.
2814   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2815   mul(product, x_xstart, y_idx);
2816   adds(product, product, carry);
2817   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2818 
2819   subw(kdx, kdx, 2);
2820   ror(product, product, 32); // back to big-endian
2821   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2822 
2823   b(L_first_loop);
2824 
2825   bind(L_one_y);
2826   ldrw(y_idx, Address(y,  0));
2827   b(L_multiply);
2828 
2829   bind(L_one_x);
2830   ldrw(x_xstart, Address(x,  0));
2831   b(L_first_loop);
2832 
2833   bind(L_first_loop_exit);
2834 }
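
// The ror(reg, reg, 32) calls above swap the two 32-bit halves of a 64-bit
// register: BigInteger stores its magnitude as big-endian-ordered ints, so a
// little-endian 64-bit load of two adjacent ints leaves them in the wrong
// halves. A standalone sketch (illustrative only; hypothetical helper):
static inline uint64_t swap_halves_sketch(uint64_t v) {
  return (v >> 32) | (v << 32);  // same effect as ror v, v, #32
}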
2835 
2836 /**
2837  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2838  *
2839  */
2840 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2841                                              Register carry, Register carry2,
2842                                              Register idx, Register jdx,
2843                                              Register yz_idx1, Register yz_idx2,
2844                                              Register tmp, Register tmp3, Register tmp4,
2845                                              Register tmp6, Register product_hi) {
2846 
2847   //   jlong carry, x[], y[], z[];
2848   //   int kdx = ystart+1;
2849   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2850   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2851   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2852   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2853   //     carry  = (jlong)(tmp4 >>> 64);
2854   //     z[kdx+idx+1] = (jlong)tmp3;
2855   //     z[kdx+idx] = (jlong)tmp4;
2856   //   }
2857   //   idx += 2;
2858   //   if (idx > 0) {
2859   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2860   //     z[kdx+idx] = (jlong)yz_idx1;
2861   //     carry  = (jlong)(yz_idx1 >>> 64);
2862   //   }
2863   //
2864 
2865   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2866 
2867   lsrw(jdx, idx, 2);
2868 
2869   bind(L_third_loop);
2870 
2871   subsw(jdx, jdx, 1);
2872   br(Assembler::MI, L_third_loop_exit);
2873   subw(idx, idx, 4);
2874 
2875   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2876 
2877   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2878 
2879   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2880 
2881   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2882   ror(yz_idx2, yz_idx2, 32);
2883 
2884   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2885 
2886   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2887   umulh(tmp4, product_hi, yz_idx1);
2888 
2889   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2890   ror(rscratch2, rscratch2, 32);
2891 
2892   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2893   umulh(carry2, product_hi, yz_idx2);
2894 
2895   // propagate sum of both multiplications into carry:tmp4:tmp3
2896   adds(tmp3, tmp3, carry);
2897   adc(tmp4, tmp4, zr);
2898   adds(tmp3, tmp3, rscratch1);
2899   adcs(tmp4, tmp4, tmp);
2900   adc(carry, carry2, zr);
2901   adds(tmp4, tmp4, rscratch2);
2902   adc(carry, carry, zr);
2903 
2904   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2905   ror(tmp4, tmp4, 32);
2906   stp(tmp4, tmp3, Address(tmp6, 0));
2907 
2908   b(L_third_loop);
2909   bind (L_third_loop_exit);
2910 
2911   andw (idx, idx, 0x3);
2912   cbz(idx, L_post_third_loop_done);
2913 
2914   Label L_check_1;
2915   subsw(idx, idx, 2);
2916   br(Assembler::MI, L_check_1);
2917 
2918   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2919   ldr(yz_idx1, Address(rscratch1, 0));
2920   ror(yz_idx1, yz_idx1, 32);
2921   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2922   umulh(tmp4, product_hi, yz_idx1);
2923   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2924   ldr(yz_idx2, Address(rscratch1, 0));
2925   ror(yz_idx2, yz_idx2, 32);
2926 
2927   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2928 
2929   ror(tmp3, tmp3, 32);
2930   str(tmp3, Address(rscratch1, 0));
2931 
2932   bind (L_check_1);
2933 
2934   andw (idx, idx, 0x1);
2935   subsw(idx, idx, 1);
2936   br(Assembler::MI, L_post_third_loop_done);
2937   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2938   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2939   umulh(carry2, tmp4, product_hi);
2940   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2941 
2942   add2_with_carry(carry2, tmp3, tmp4, carry);
2943 
2944   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2945   extr(carry, carry2, tmp3, 32);
2946 
2947   bind(L_post_third_loop_done);
2948 }
2949 
2950 /**
2951  * Code for BigInteger::multiplyToLen() intrinsic.
2952  *
2953  * r0: x
2954  * r1: xlen
2955  * r2: y
2956  * r3: ylen
2957  * r4: z
2958  * r5: zlen
2959  * r10: tmp1
2960  * r11: tmp2
2961  * r12: tmp3
2962  * r13: tmp4
2963  * r14: tmp5
2964  * r15: tmp6
2965  * r16: tmp7
2966  *
2967  */
2968 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2969                                      Register z, Register zlen,
2970                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2971                                      Register tmp5, Register tmp6, Register product_hi) {
2972 
2973   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2974 
2975   const Register idx = tmp1;
2976   const Register kdx = tmp2;
2977   const Register xstart = tmp3;
2978 
2979   const Register y_idx = tmp4;
2980   const Register carry = tmp5;
2981   const Register product  = xlen;
2982   const Register x_xstart = zlen;  // reuse register
2983 
2984   // First Loop.
2985   //
2986   //  final static long LONG_MASK = 0xffffffffL;
2987   //  int xstart = xlen - 1;
2988   //  int ystart = ylen - 1;
2989   //  long carry = 0;
2990   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2991   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2992   //    z[kdx] = (int)product;
2993   //    carry = product >>> 32;
2994   //  }
2995   //  z[xstart] = (int)carry;
2996   //
2997 
2998   movw(idx, ylen);      // idx = ylen;
2999   movw(kdx, zlen);      // kdx = xlen+ylen;
3000   mov(carry, zr);       // carry = 0;
3001 
3002   Label L_done;
3003 
3004   movw(xstart, xlen);
3005   subsw(xstart, xstart, 1);
3006   br(Assembler::MI, L_done);
3007 
3008   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3009 
3010   Label L_second_loop;
3011   cbzw(kdx, L_second_loop);
3012 
3013   Label L_carry;
3014   subw(kdx, kdx, 1);
3015   cbzw(kdx, L_carry);
3016 
3017   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3018   lsr(carry, carry, 32);
3019   subw(kdx, kdx, 1);
3020 
3021   bind(L_carry);
3022   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3023 
3024   // Second and third (nested) loops.
3025   //
3026   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3027   //   carry = 0;
3028   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3029   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3030   //                    (z[k] & LONG_MASK) + carry;
3031   //     z[k] = (int)product;
3032   //     carry = product >>> 32;
3033   //   }
3034   //   z[i] = (int)carry;
3035   // }
3036   //
3037   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3038 
3039   const Register jdx = tmp1;
3040 
3041   bind(L_second_loop);
3042   mov(carry, zr);                // carry = 0;
3043   movw(jdx, ylen);               // j = ystart+1
3044 
3045   subsw(xstart, xstart, 1);      // i = xstart-1;
3046   br(Assembler::MI, L_done);
3047 
3048   str(z, Address(pre(sp, -4 * wordSize)));
3049 
3050   Label L_last_x;
3051   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3052   subsw(xstart, xstart, 1);       // i = xstart-1;
3053   br(Assembler::MI, L_last_x);
3054 
3055   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3056   ldr(product_hi, Address(rscratch1));
3057   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3058 
3059   Label L_third_loop_prologue;
3060   bind(L_third_loop_prologue);
3061 
3062   str(ylen, Address(sp, wordSize));
3063   stp(x, xstart, Address(sp, 2 * wordSize));
3064   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3065                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3066   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3067   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3068 
3069   addw(tmp3, xlen, 1);
3070   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3071   subsw(tmp3, tmp3, 1);
3072   br(Assembler::MI, L_done);
3073 
3074   lsr(carry, carry, 32);
3075   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3076   b(L_second_loop);
3077 
3078   // Next infrequent code is moved outside loops.
3079   bind(L_last_x);
3080   ldrw(product_hi, Address(x,  0));
3081   b(L_third_loop_prologue);
3082 
3083   bind(L_done);
3084 }
3085 
3086 // Code for BigInteger::mulAdd intrinsic
3087 // out     = r0
3088 // in      = r1
3089 // offset  = r2  (already out.length-offset)
3090 // len     = r3
3091 // k       = r4
3092 //
3093 // pseudo code from java implementation:
3094 // carry = 0;
3095 // offset = out.length-offset - 1;
3096 // for (int j=len-1; j >= 0; j--) {
3097 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3098 //     out[offset--] = (int)product;
3099 //     carry = product >>> 32;
3100 // }
3101 // return (int)carry;
3102 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3103       Register len, Register k) {
3104     Label LOOP, END;
3105     // pre-loop
3106     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3107     csel(out, zr, out, Assembler::EQ);
3108     br(Assembler::EQ, END);
3109     add(in, in, len, LSL, 2); // in[j+1] address
3110     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3111     mov(out, zr); // used to keep carry now
3112     BIND(LOOP);
3113     ldrw(rscratch1, Address(pre(in, -4)));
3114     madd(rscratch1, rscratch1, k, out);
3115     ldrw(rscratch2, Address(pre(offset, -4)));
3116     add(rscratch1, rscratch1, rscratch2);
3117     strw(rscratch1, Address(offset));
3118     lsr(out, rscratch1, 32);
3119     subs(len, len, 1);
3120     br(Assembler::NE, LOOP);
3121     BIND(END);
3122 }
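
// A standalone sketch (illustrative only; hypothetical helper) of one loop
// iteration above, matching the Java pseudo code: widen everything to 64 bits,
// accumulate, store the low half, and return the high half as the next carry.
static inline uint32_t mul_add_step_sketch(uint32_t& out_word, uint32_t in_word,
                                           uint32_t k, uint32_t carry) {
  uint64_t product = (uint64_t)in_word * k + out_word + carry;  // madd + add
  out_word = (uint32_t)product;                                 // strw
  return (uint32_t)(product >> 32);                             // lsr .., 32
}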
3123 
3124 /**
3125  * Emits code to update CRC-32 with a byte value according to constants in table
3126  *
3127  * @param [in,out]crc   Register containing the crc.
3128  * @param [in]val       Register containing the byte to fold into the CRC.
3129  * @param [in]table     Register containing the table of crc constants.
3130  *
3131  * uint32_t crc;
3132  * val = crc_table[(val ^ crc) & 0xFF];
3133  * crc = val ^ (crc >> 8);
3134  *
3135  */
3136 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3137   eor(val, val, crc);
3138   andr(val, val, 0xff);
3139   ldrw(val, Address(table, val, Address::lsl(2)));
3140   eor(crc, val, crc, Assembler::LSR, 8);
3141 }
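
// The same byte step as a standalone sketch (illustrative only; hypothetical
// helper), assuming a 256-entry table for the reflected CRC-32 polynomial:
static inline uint32_t update_byte_crc32_sketch(uint32_t crc, uint8_t val,
                                                const uint32_t* table) {
  return table[(val ^ crc) & 0xff] ^ (crc >> 8);
}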
3142 
3143 /**
3144  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3145  *
3146  * @param [in,out]crc   Register containing the crc.
3147  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3148  * @param [in]table0    Register containing table 0 of crc constants.
3149  * @param [in]table1    Register containing table 1 of crc constants.
3150  * @param [in]table2    Register containing table 2 of crc constants.
3151  * @param [in]table3    Register containing table 3 of crc constants.
3152  *
3153  * uint32_t crc;
3154  *   v = crc ^ v
3155  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3156  *
3157  */
3158 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3159         Register table0, Register table1, Register table2, Register table3,
3160         bool upper) {
3161   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3162   uxtb(tmp, v);
3163   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3164   ubfx(tmp, v, 8, 8);
3165   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3166   eor(crc, crc, tmp);
3167   ubfx(tmp, v, 16, 8);
3168   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3169   eor(crc, crc, tmp);
3170   ubfx(tmp, v, 24, 8);
3171   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3172   eor(crc, crc, tmp);
3173 }
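
// A standalone sketch (illustrative only; hypothetical helper) of the
// slicing-by-4 step above for the lower word (upper == false); for the upper
// word the code first shifts the 64-bit register right by 32. Each byte of
// (crc ^ v) indexes its own table, consuming 32 input bits per step.
static inline uint32_t update_word_crc32_sketch(uint32_t crc, uint32_t v,
                                                const uint32_t* t0, const uint32_t* t1,
                                                const uint32_t* t2, const uint32_t* t3) {
  v ^= crc;
  return t3[v & 0xff] ^ t2[(v >> 8) & 0xff] ^
         t1[(v >> 16) & 0xff] ^ t0[(v >> 24) & 0xff];
}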
3174 
3175 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3176         Register len, Register tmp0, Register tmp1, Register tmp2,
3177         Register tmp3) {
3178     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3179     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3180 
3181     mvnw(crc, crc);
3182 
3183     subs(len, len, 128);
3184     br(Assembler::GE, CRC_by64_pre);
3185   BIND(CRC_less64);
3186     adds(len, len, 128-32);
3187     br(Assembler::GE, CRC_by32_loop);
3188   BIND(CRC_less32);
3189     adds(len, len, 32-4);
3190     br(Assembler::GE, CRC_by4_loop);
3191     adds(len, len, 4);
3192     br(Assembler::GT, CRC_by1_loop);
3193     b(L_exit);
3194 
3195   BIND(CRC_by32_loop);
3196     ldp(tmp0, tmp1, Address(post(buf, 16)));
3197     subs(len, len, 32);
3198     crc32x(crc, crc, tmp0);
3199     ldr(tmp2, Address(post(buf, 8)));
3200     crc32x(crc, crc, tmp1);
3201     ldr(tmp3, Address(post(buf, 8)));
3202     crc32x(crc, crc, tmp2);
3203     crc32x(crc, crc, tmp3);
3204     br(Assembler::GE, CRC_by32_loop);
3205     cmn(len, 32);
3206     br(Assembler::NE, CRC_less32);
3207     b(L_exit);
3208 
3209   BIND(CRC_by4_loop);
3210     ldrw(tmp0, Address(post(buf, 4)));
3211     subs(len, len, 4);
3212     crc32w(crc, crc, tmp0);
3213     br(Assembler::GE, CRC_by4_loop);
3214     adds(len, len, 4);
3215     br(Assembler::LE, L_exit);
3216   BIND(CRC_by1_loop);
3217     ldrb(tmp0, Address(post(buf, 1)));
3218     subs(len, len, 1);
3219     crc32b(crc, crc, tmp0);
3220     br(Assembler::GT, CRC_by1_loop);
3221     b(L_exit);
3222 
3223   BIND(CRC_by64_pre);
3224     sub(buf, buf, 8);
3225     ldp(tmp0, tmp1, Address(buf, 8));
3226     crc32x(crc, crc, tmp0);
3227     ldr(tmp2, Address(buf, 24));
3228     crc32x(crc, crc, tmp1);
3229     ldr(tmp3, Address(buf, 32));
3230     crc32x(crc, crc, tmp2);
3231     ldr(tmp0, Address(buf, 40));
3232     crc32x(crc, crc, tmp3);
3233     ldr(tmp1, Address(buf, 48));
3234     crc32x(crc, crc, tmp0);
3235     ldr(tmp2, Address(buf, 56));
3236     crc32x(crc, crc, tmp1);
3237     ldr(tmp3, Address(pre(buf, 64)));
3238 
3239     b(CRC_by64_loop);
3240 
3241     align(CodeEntryAlignment);
3242   BIND(CRC_by64_loop);
3243     subs(len, len, 64);
3244     crc32x(crc, crc, tmp2);
3245     ldr(tmp0, Address(buf, 8));
3246     crc32x(crc, crc, tmp3);
3247     ldr(tmp1, Address(buf, 16));
3248     crc32x(crc, crc, tmp0);
3249     ldr(tmp2, Address(buf, 24));
3250     crc32x(crc, crc, tmp1);
3251     ldr(tmp3, Address(buf, 32));
3252     crc32x(crc, crc, tmp2);
3253     ldr(tmp0, Address(buf, 40));
3254     crc32x(crc, crc, tmp3);
3255     ldr(tmp1, Address(buf, 48));
3256     crc32x(crc, crc, tmp0);
3257     ldr(tmp2, Address(buf, 56));
3258     crc32x(crc, crc, tmp1);
3259     ldr(tmp3, Address(pre(buf, 64)));
3260     br(Assembler::GE, CRC_by64_loop);
3261 
3262     // post-loop
3263     crc32x(crc, crc, tmp2);
3264     crc32x(crc, crc, tmp3);
3265 
3266     sub(len, len, 64);
3267     add(buf, buf, 8);
3268     cmn(len, 128);
3269     br(Assembler::NE, CRC_less64);
3270   BIND(L_exit);
3271     mvnw(crc, crc);
3272 }
3273 
3274 /**
3275  * @param crc   register containing existing CRC (32-bit)
3276  * @param buf   register pointing to input byte buffer (byte*)
3277  * @param len   register containing number of bytes
3278  * @param table register that will contain address of CRC table
3279  * @param tmp   scratch register
3280  */
3281 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3282         Register table0, Register table1, Register table2, Register table3,
3283         Register tmp, Register tmp2, Register tmp3) {
3284   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3285   uint64_t offset;
3286 
3287   if (UseCRC32) {
3288       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3289       return;
3290   }
3291 
3292     mvnw(crc, crc);
3293 
3294     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3295     if (offset) add(table0, table0, offset);
3296     add(table1, table0, 1*256*sizeof(juint));
3297     add(table2, table0, 2*256*sizeof(juint));
3298     add(table3, table0, 3*256*sizeof(juint));
3299 
3300   if (UseNeon) {
3301       cmp(len, 64);
3302       br(Assembler::LT, L_by16);
3303       eor(v16, T16B, v16, v16);
3304 
3305     Label L_fold;
3306 
3307       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3308 
3309       ld1(v0, v1, T2D, post(buf, 32));
3310       ld1r(v4, T2D, post(tmp, 8));
3311       ld1r(v5, T2D, post(tmp, 8));
3312       ld1r(v6, T2D, post(tmp, 8));
3313       ld1r(v7, T2D, post(tmp, 8));
3314       mov(v16, T4S, 0, crc);
3315 
3316       eor(v0, T16B, v0, v16);
3317       sub(len, len, 64);
3318 
3319     BIND(L_fold);
3320       pmull(v22, T8H, v0, v5, T8B);
3321       pmull(v20, T8H, v0, v7, T8B);
3322       pmull(v23, T8H, v0, v4, T8B);
3323       pmull(v21, T8H, v0, v6, T8B);
3324 
3325       pmull2(v18, T8H, v0, v5, T16B);
3326       pmull2(v16, T8H, v0, v7, T16B);
3327       pmull2(v19, T8H, v0, v4, T16B);
3328       pmull2(v17, T8H, v0, v6, T16B);
3329 
3330       uzp1(v24, T8H, v20, v22);
3331       uzp2(v25, T8H, v20, v22);
3332       eor(v20, T16B, v24, v25);
3333 
3334       uzp1(v26, T8H, v16, v18);
3335       uzp2(v27, T8H, v16, v18);
3336       eor(v16, T16B, v26, v27);
3337 
3338       ushll2(v22, T4S, v20, T8H, 8);
3339       ushll(v20, T4S, v20, T4H, 8);
3340 
3341       ushll2(v18, T4S, v16, T8H, 8);
3342       ushll(v16, T4S, v16, T4H, 8);
3343 
3344       eor(v22, T16B, v23, v22);
3345       eor(v18, T16B, v19, v18);
3346       eor(v20, T16B, v21, v20);
3347       eor(v16, T16B, v17, v16);
3348 
3349       uzp1(v17, T2D, v16, v20);
3350       uzp2(v21, T2D, v16, v20);
3351       eor(v17, T16B, v17, v21);
3352 
3353       ushll2(v20, T2D, v17, T4S, 16);
3354       ushll(v16, T2D, v17, T2S, 16);
3355 
3356       eor(v20, T16B, v20, v22);
3357       eor(v16, T16B, v16, v18);
3358 
3359       uzp1(v17, T2D, v20, v16);
3360       uzp2(v21, T2D, v20, v16);
3361       eor(v28, T16B, v17, v21);
3362 
3363       pmull(v22, T8H, v1, v5, T8B);
3364       pmull(v20, T8H, v1, v7, T8B);
3365       pmull(v23, T8H, v1, v4, T8B);
3366       pmull(v21, T8H, v1, v6, T8B);
3367 
3368       pmull2(v18, T8H, v1, v5, T16B);
3369       pmull2(v16, T8H, v1, v7, T16B);
3370       pmull2(v19, T8H, v1, v4, T16B);
3371       pmull2(v17, T8H, v1, v6, T16B);
3372 
3373       ld1(v0, v1, T2D, post(buf, 32));
3374 
3375       uzp1(v24, T8H, v20, v22);
3376       uzp2(v25, T8H, v20, v22);
3377       eor(v20, T16B, v24, v25);
3378 
3379       uzp1(v26, T8H, v16, v18);
3380       uzp2(v27, T8H, v16, v18);
3381       eor(v16, T16B, v26, v27);
3382 
3383       ushll2(v22, T4S, v20, T8H, 8);
3384       ushll(v20, T4S, v20, T4H, 8);
3385 
3386       ushll2(v18, T4S, v16, T8H, 8);
3387       ushll(v16, T4S, v16, T4H, 8);
3388 
3389       eor(v22, T16B, v23, v22);
3390       eor(v18, T16B, v19, v18);
3391       eor(v20, T16B, v21, v20);
3392       eor(v16, T16B, v17, v16);
3393 
3394       uzp1(v17, T2D, v16, v20);
3395       uzp2(v21, T2D, v16, v20);
3396       eor(v16, T16B, v17, v21);
3397 
3398       ushll2(v20, T2D, v16, T4S, 16);
3399       ushll(v16, T2D, v16, T2S, 16);
3400 
3401       eor(v20, T16B, v22, v20);
3402       eor(v16, T16B, v16, v18);
3403 
3404       uzp1(v17, T2D, v20, v16);
3405       uzp2(v21, T2D, v20, v16);
3406       eor(v20, T16B, v17, v21);
3407 
3408       shl(v16, T2D, v28, 1);
3409       shl(v17, T2D, v20, 1);
3410 
3411       eor(v0, T16B, v0, v16);
3412       eor(v1, T16B, v1, v17);
3413 
3414       subs(len, len, 32);
3415       br(Assembler::GE, L_fold);
3416 
3417       mov(crc, 0);
3418       mov(tmp, v0, T1D, 0);
3419       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3420       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3421       mov(tmp, v0, T1D, 1);
3422       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3423       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3424       mov(tmp, v1, T1D, 0);
3425       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3426       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3427       mov(tmp, v1, T1D, 1);
3428       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3429       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3430 
3431       add(len, len, 32);
3432   }
3433 
3434   BIND(L_by16);
3435     subs(len, len, 16);
3436     br(Assembler::GE, L_by16_loop);
3437     adds(len, len, 16-4);
3438     br(Assembler::GE, L_by4_loop);
3439     adds(len, len, 4);
3440     br(Assembler::GT, L_by1_loop);
3441     b(L_exit);
3442 
3443   BIND(L_by4_loop);
3444     ldrw(tmp, Address(post(buf, 4)));
3445     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3446     subs(len, len, 4);
3447     br(Assembler::GE, L_by4_loop);
3448     adds(len, len, 4);
3449     br(Assembler::LE, L_exit);
3450   BIND(L_by1_loop);
3451     subs(len, len, 1);
3452     ldrb(tmp, Address(post(buf, 1)));
3453     update_byte_crc32(crc, tmp, table0);
3454     br(Assembler::GT, L_by1_loop);
3455     b(L_exit);
3456 
3457     align(CodeEntryAlignment);
3458   BIND(L_by16_loop);
3459     subs(len, len, 16);
3460     ldp(tmp, tmp3, Address(post(buf, 16)));
3461     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3462     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3463     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3464     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3465     br(Assembler::GE, L_by16_loop);
3466     adds(len, len, 16-4);
3467     br(Assembler::GE, L_by4_loop);
3468     adds(len, len, 4);
3469     br(Assembler::GT, L_by1_loop);
3470   BIND(L_exit);
3471     mvnw(crc, crc);
3472 }
3473 
3474 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3475         Register len, Register tmp0, Register tmp1, Register tmp2,
3476         Register tmp3) {
3477     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3478     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3479 
3480     subs(len, len, 128);
3481     br(Assembler::GE, CRC_by64_pre);
3482   BIND(CRC_less64);
3483     adds(len, len, 128-32);
3484     br(Assembler::GE, CRC_by32_loop);
3485   BIND(CRC_less32);
3486     adds(len, len, 32-4);
3487     br(Assembler::GE, CRC_by4_loop);
3488     adds(len, len, 4);
3489     br(Assembler::GT, CRC_by1_loop);
3490     b(L_exit);
3491 
3492   BIND(CRC_by32_loop);
3493     ldp(tmp0, tmp1, Address(post(buf, 16)));
3494     subs(len, len, 32);
3495     crc32cx(crc, crc, tmp0);
3496     ldr(tmp2, Address(post(buf, 8)));
3497     crc32cx(crc, crc, tmp1);
3498     ldr(tmp3, Address(post(buf, 8)));
3499     crc32cx(crc, crc, tmp2);
3500     crc32cx(crc, crc, tmp3);
3501     br(Assembler::GE, CRC_by32_loop);
3502     cmn(len, 32);
3503     br(Assembler::NE, CRC_less32);
3504     b(L_exit);
3505 
3506   BIND(CRC_by4_loop);
3507     ldrw(tmp0, Address(post(buf, 4)));
3508     subs(len, len, 4);
3509     crc32cw(crc, crc, tmp0);
3510     br(Assembler::GE, CRC_by4_loop);
3511     adds(len, len, 4);
3512     br(Assembler::LE, L_exit);
3513   BIND(CRC_by1_loop);
3514     ldrb(tmp0, Address(post(buf, 1)));
3515     subs(len, len, 1);
3516     crc32cb(crc, crc, tmp0);
3517     br(Assembler::GT, CRC_by1_loop);
3518     b(L_exit);
3519 
3520   BIND(CRC_by64_pre);
3521     sub(buf, buf, 8);
3522     ldp(tmp0, tmp1, Address(buf, 8));
3523     crc32cx(crc, crc, tmp0);
3524     ldr(tmp2, Address(buf, 24));
3525     crc32cx(crc, crc, tmp1);
3526     ldr(tmp3, Address(buf, 32));
3527     crc32cx(crc, crc, tmp2);
3528     ldr(tmp0, Address(buf, 40));
3529     crc32cx(crc, crc, tmp3);
3530     ldr(tmp1, Address(buf, 48));
3531     crc32cx(crc, crc, tmp0);
3532     ldr(tmp2, Address(buf, 56));
3533     crc32cx(crc, crc, tmp1);
3534     ldr(tmp3, Address(pre(buf, 64)));
3535 
3536     b(CRC_by64_loop);
3537 
3538     align(CodeEntryAlignment);
3539   BIND(CRC_by64_loop);
3540     subs(len, len, 64);
3541     crc32cx(crc, crc, tmp2);
3542     ldr(tmp0, Address(buf, 8));
3543     crc32cx(crc, crc, tmp3);
3544     ldr(tmp1, Address(buf, 16));
3545     crc32cx(crc, crc, tmp0);
3546     ldr(tmp2, Address(buf, 24));
3547     crc32cx(crc, crc, tmp1);
3548     ldr(tmp3, Address(buf, 32));
3549     crc32cx(crc, crc, tmp2);
3550     ldr(tmp0, Address(buf, 40));
3551     crc32cx(crc, crc, tmp3);
3552     ldr(tmp1, Address(buf, 48));
3553     crc32cx(crc, crc, tmp0);
3554     ldr(tmp2, Address(buf, 56));
3555     crc32cx(crc, crc, tmp1);
3556     ldr(tmp3, Address(pre(buf, 64)));
3557     br(Assembler::GE, CRC_by64_loop);
3558 
3559     // post-loop
3560     crc32cx(crc, crc, tmp2);
3561     crc32cx(crc, crc, tmp3);
3562 
3563     sub(len, len, 64);
3564     add(buf, buf, 8);
3565     cmn(len, 128);
3566     br(Assembler::NE, CRC_less64);
3567   BIND(L_exit);
3568 }
3569 
3570 /**
3571  * @param crc   register containing existing CRC (32-bit)
3572  * @param buf   register pointing to input byte buffer (byte*)
3573  * @param len   register containing number of bytes
3574  * @param table register that will contain address of CRC table
3575  * @param tmp   scratch register
3576  */
3577 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3578         Register table0, Register table1, Register table2, Register table3,
3579         Register tmp, Register tmp2, Register tmp3) {
3580   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3581 }
3582 
3583 
3584 SkipIfEqual::SkipIfEqual(
3585     MacroAssembler* masm, const bool* flag_addr, bool value) {
3586   _masm = masm;
3587   uint64_t offset;
3588   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3589   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3590   _masm->cbzw(rscratch1, _label);
3591 }
3592 
3593 SkipIfEqual::~SkipIfEqual() {
3594   _masm->bind(_label);
3595 }
3596 
3597 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3598   Address adr;
3599   switch(dst.getMode()) {
3600   case Address::base_plus_offset:
3601     // This is the expected mode, although we allow all the other
3602     // forms below.
3603     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3604     break;
3605   default:
3606     lea(rscratch2, dst);
3607     adr = Address(rscratch2);
3608     break;
3609   }
3610   ldr(rscratch1, adr);
3611   add(rscratch1, rscratch1, src);
3612   str(rscratch1, adr);
3613 }
3614 
3615 void MacroAssembler::cmpptr(Register src1, Address src2) {
3616   uint64_t offset;
3617   adrp(rscratch1, src2, offset);
3618   ldr(rscratch1, Address(rscratch1, offset));
3619   cmp(src1, rscratch1);
3620 }
3621 
3622 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3623   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3624   bs->obj_equals(this, obj1, obj2);
3625 }
3626 
3627 void MacroAssembler::load_klass(Register dst, Register src) {
3628   if (UseCompressedClassPointers) {
3629     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3630     decode_klass_not_null(dst);
3631   } else {
3632     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3633   }
3634 }
3635 
3636 // ((OopHandle)result).resolve();
3637 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3638   // OopHandle::resolve is an indirection.
3639   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3640 }
3641 
3642 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3643   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3644   ldr(dst, Address(rmethod, Method::const_offset()));
3645   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3646   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3647   ldr(dst, Address(dst, mirror_offset));
3648   resolve_oop_handle(dst, tmp);
3649 }
3650 
3651 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3652   if (UseCompressedClassPointers) {
3653     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3654     if (Universe::narrow_klass_base() == NULL) {
3655       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3656       return;
3657     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3658                && Universe::narrow_klass_shift() == 0) {
3659       // Only the bottom 32 bits matter
3660       cmpw(trial_klass, tmp);
3661       return;
3662     }
3663     decode_klass_not_null(tmp);
3664   } else {
3665     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3666   }
3667   cmp(trial_klass, tmp);
3668 }
3669 
3670 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3671   load_klass(dst, src);
3672   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3673 }
3674 
3675 void MacroAssembler::store_klass(Register dst, Register src) {
3676   // FIXME: Should this be a store release?  Concurrent GCs assume the
3677   // klass length is valid if the klass field is not null.
3678   if (UseCompressedClassPointers) {
3679     encode_klass_not_null(src);
3680     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3681   } else {
3682     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3683   }
3684 }
3685 
3686 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3687   if (UseCompressedClassPointers) {
3688     // Store to klass gap in destination
3689     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3690   }
3691 }
3692 
3693 // Algorithm must match CompressedOops::encode.
3694 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3695 #ifdef ASSERT
3696   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3697 #endif
3698   verify_oop(s, "broken oop in encode_heap_oop");
3699   if (Universe::narrow_oop_base() == NULL) {
3700     if (Universe::narrow_oop_shift() != 0) {
3701       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3702       lsr(d, s, LogMinObjAlignmentInBytes);
3703     } else {
3704       mov(d, s);
3705     }
3706   } else {
3707     subs(d, s, rheapbase);
3708     csel(d, d, zr, Assembler::HS);
3709     lsr(d, d, LogMinObjAlignmentInBytes);
3710 
3711     /*  Old algorithm: is this any worse?
3712     Label nonnull;
3713     cbnz(r, nonnull);
3714     sub(r, r, rheapbase);
3715     bind(nonnull);
3716     lsr(r, r, LogMinObjAlignmentInBytes);
3717     */
3718   }
3719 }
3720 
3721 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3722 #ifdef ASSERT
3723   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3724   if (CheckCompressedOops) {
3725     Label ok;
3726     cbnz(r, ok);
3727     stop("null oop passed to encode_heap_oop_not_null");
3728     bind(ok);
3729   }
3730 #endif
3731   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3732   if (Universe::narrow_oop_base() != NULL) {
3733     sub(r, r, rheapbase);
3734   }
3735   if (Universe::narrow_oop_shift() != 0) {
3736     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3737     lsr(r, r, LogMinObjAlignmentInBytes);
3738   }
3739 }
3740 
3741 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3742 #ifdef ASSERT
3743   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3744   if (CheckCompressedOops) {
3745     Label ok;
3746     cbnz(src, ok);
3747     stop("null oop passed to encode_heap_oop_not_null2");
3748     bind(ok);
3749   }
3750 #endif
3751   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3752 
3753   Register data = src;
3754   if (Universe::narrow_oop_base() != NULL) {
3755     sub(dst, src, rheapbase);
3756     data = dst;
3757   }
3758   if (Universe::narrow_oop_shift() != 0) {
3759     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3760     lsr(dst, data, LogMinObjAlignmentInBytes);
3761     data = dst;
3762   }
3763   if (data == src)
3764     mov(dst, src);
3765 }
3766 
3767 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3768 #ifdef ASSERT
3769   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3770 #endif
3771   if (Universe::narrow_oop_base() == NULL) {
3772     if (Universe::narrow_oop_shift() != 0 || d != s) {
3773       lsl(d, s, Universe::narrow_oop_shift());
3774     }
3775   } else {
3776     Label done;
3777     if (d != s)
3778       mov(d, s);
3779     cbz(s, done);
3780     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3781     bind(done);
3782   }
3783   verify_oop(d, "broken oop in decode_heap_oop");
3784 }
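
// A standalone round-trip sketch (illustrative only; hypothetical helpers) of
// the compressed-oop scheme implemented above for the heap-based case: encode
// subtracts the heap base and shifts right by the object-alignment shift,
// decode shifts left and adds the base back, and NULL stays 0 both ways.
static inline uint32_t encode_heap_oop_sketch(uint64_t oop, uint64_t heap_base, int shift) {
  return oop == 0 ? 0 : (uint32_t)((oop - heap_base) >> shift);      // subs/csel/lsr
}
static inline uint64_t decode_heap_oop_sketch(uint32_t narrow, uint64_t heap_base, int shift) {
  return narrow == 0 ? 0 : heap_base + ((uint64_t)narrow << shift);  // cbz/add .., lsl
}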
3785 
3786 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3787   assert (UseCompressedOops, "should only be used for compressed headers");
3788   assert (Universe::heap() != NULL, "java heap should be initialized");
3789   // Cannot assert, unverified entry point counts instructions (see .ad file)
3790   // vtableStubs also counts instructions in pd_code_size_limit.
3791   // Also do not verify_oop as this is called by verify_oop.
3792   if (Universe::narrow_oop_shift() != 0) {
3793     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3794     if (Universe::narrow_oop_base() != NULL) {
3795       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3796     } else {
3797       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3798     }
3799   } else {
3800     assert (Universe::narrow_oop_base() == NULL, "sanity");
3801   }
3802 }
3803 
3804 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3805   assert (UseCompressedOops, "should only be used for compressed headers");
3806   assert (Universe::heap() != NULL, "java heap should be initialized");
3807   // Cannot assert, unverified entry point counts instructions (see .ad file)
3808   // vtableStubs also counts instructions in pd_code_size_limit.
3809   // Also do not verify_oop as this is called by verify_oop.
3810   if (Universe::narrow_oop_shift() != 0) {
3811     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3812     if (Universe::narrow_oop_base() != NULL) {
3813       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3814     } else {
3815       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3816     }
3817   } else {
3818     assert (Universe::narrow_oop_base() == NULL, "sanity");
3819     if (dst != src) {
3820       mov(dst, src);
3821     }
3822   }
3823 }
3824 
3825 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3826   if (Universe::narrow_klass_base() == NULL) {
3827     if (Universe::narrow_klass_shift() != 0) {
3828       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3829       lsr(dst, src, LogKlassAlignmentInBytes);
3830     } else {
3831       if (dst != src) mov(dst, src);
3832     }
3833     return;
3834   }
3835 
3836   if (use_XOR_for_compressed_class_base) {
3837     if (Universe::narrow_klass_shift() != 0) {
3838       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3839       lsr(dst, dst, LogKlassAlignmentInBytes);
3840     } else {
3841       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3842     }
3843     return;
3844   }
3845 
3846   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3847       && Universe::narrow_klass_shift() == 0) {
3848     movw(dst, src);
3849     return;
3850   }
3851 
3852 #ifdef ASSERT
3853   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3854 #endif
3855 
3856   Register rbase = dst;
3857   if (dst == src) rbase = rheapbase;
3858   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3859   sub(dst, src, rbase);
3860   if (Universe::narrow_klass_shift() != 0) {
3861     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3862     lsr(dst, dst, LogKlassAlignmentInBytes);
3863   }
3864   if (dst == src) reinit_heapbase();
3865 }
3866 
3867 void MacroAssembler::encode_klass_not_null(Register r) {
3868   encode_klass_not_null(r, r);
3869 }
3870 
3871 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3872   Register rbase = dst;
3873   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3874 
3875   if (Universe::narrow_klass_base() == NULL) {
3876     if (Universe::narrow_klass_shift() != 0) {
3877       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3878       lsl(dst, src, LogKlassAlignmentInBytes);
3879     } else {
3880       if (dst != src) mov(dst, src);
3881     }
3882     return;
3883   }
3884 
3885   if (use_XOR_for_compressed_class_base) {
3886     if (Universe::narrow_klass_shift() != 0) {
3887       lsl(dst, src, LogKlassAlignmentInBytes);
3888       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3889     } else {
3890       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3891     }
3892     return;
3893   }
3894 
3895   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3896       && Universe::narrow_klass_shift() == 0) {
3897     if (dst != src)
3898       movw(dst, src);
3899     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3900     return;
3901   }
3902 
3903   // Cannot assert, unverified entry point counts instructions (see .ad file)
3904   // vtableStubs also counts instructions in pd_code_size_limit.
3905   // Also do not verify_oop as this is called by verify_oop.
3906   if (dst == src) rbase = rheapbase;
3907   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3908   if (Universe::narrow_klass_shift() != 0) {
3909     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3910     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3911   } else {
3912     add(dst, rbase, src);
3913   }
3914   if (dst == src) reinit_heapbase();
3915 }
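
// A standalone sketch (illustrative only; hypothetical helper) of the
// movw/movk fast path above: when the klass base has zero low 32 bits, no
// shift is in use, and the base fits in 48 bits, the narrow klass is just the
// low word, so decoding is a zero-extend plus inserting the base's upper bits.
static inline uint64_t decode_klass_base32_sketch(uint32_t narrow, uint64_t base) {
  // Precondition: (base & 0xffffffff) == 0.
  return (uint64_t)narrow | base;  // movw dst, src ; movk dst, base >> 32, lsl #32
}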
3916 
3917 void  MacroAssembler::decode_klass_not_null(Register r) {
3918   decode_klass_not_null(r, r);
3919 }
3920 
3921 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3922 #ifdef ASSERT
3923   {
3924     ThreadInVMfromUnknown tiv;
3925     assert (UseCompressedOops, "should only be used for compressed oops");
3926     assert (Universe::heap() != NULL, "java heap should be initialized");
3927     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3928     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3929   }
3930 #endif
3931   int oop_index = oop_recorder()->find_index(obj);
3932   InstructionMark im(this);
3933   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3934   code_section()->relocate(inst_mark(), rspec);
3935   movz(dst, 0xDEAD, 16);
3936   movk(dst, 0xBEEF);
3937 }
3938 
3939 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3940   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3941   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3942   int index = oop_recorder()->find_index(k);
3943   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3944 
3945   InstructionMark im(this);
3946   RelocationHolder rspec = metadata_Relocation::spec(index);
3947   code_section()->relocate(inst_mark(), rspec);
3948   narrowKlass nk = Klass::encode_klass(k);
3949   movz(dst, (nk >> 16), 16);
3950   movk(dst, nk & 0xffff);
3951 }
3952 
3953 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3954                                     Register dst, Address src,
3955                                     Register tmp1, Register thread_tmp) {
3956   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3957   decorators = AccessInternal::decorator_fixup(decorators);
3958   bool as_raw = (decorators & AS_RAW) != 0;
3959   if (as_raw) {
3960     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3961   } else {
3962     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3963   }
3964 }
3965 
3966 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3967                                      Address dst, Register src,
3968                                      Register tmp1, Register thread_tmp) {
3969   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3970   decorators = AccessInternal::decorator_fixup(decorators);
3971   bool as_raw = (decorators & AS_RAW) != 0;
3972   if (as_raw) {
3973     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3974   } else {
3975     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3976   }
3977 }
3978 
3979 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3980                                    Register thread_tmp, DecoratorSet decorators) {
3981   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3982 }
3983 
3984 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3985                                             Register thread_tmp, DecoratorSet decorators) {
3986   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
3987 }
3988 
3989 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
3990                                     Register thread_tmp, DecoratorSet decorators) {
3991   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3992 }
3993 
3994 // Used for storing NULLs.
3995 void MacroAssembler::store_heap_oop_null(Address dst) {
3996   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
3997 }
3998 
3999 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4000   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4001   int index = oop_recorder()->allocate_metadata_index(obj);
4002   RelocationHolder rspec = metadata_Relocation::spec(index);
4003   return Address((address)obj, rspec);
4004 }
4005 
4006 // Move an oop into a register.  immediate is true if we want
4007 // immediate instructions, i.e. we are not going to patch this
4008 // instruction while the code is being executed by another thread.  In
4009 // that case we can use move immediates rather than the constant pool.
4010 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4011   int oop_index;
4012   if (obj == NULL) {
4013     oop_index = oop_recorder()->allocate_oop_index(obj);
4014   } else {
4015 #ifdef ASSERT
4016     {
4017       ThreadInVMfromUnknown tiv;
4018       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4019     }
4020 #endif
4021     oop_index = oop_recorder()->find_index(obj);
4022   }
4023   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4024   if (! immediate) {
4025     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4026     ldr_constant(dst, Address(dummy, rspec));
4027   } else
4028     mov(dst, Address((address)obj, rspec));
4029 }
4030 
4031 // Move a metadata address into a register.
4032 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4033   int oop_index;
4034   if (obj == NULL) {
4035     oop_index = oop_recorder()->allocate_metadata_index(obj);
4036   } else {
4037     oop_index = oop_recorder()->find_index(obj);
4038   }
4039   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4040   mov(dst, Address((address)obj, rspec));
4041 }
4042 
4043 Address MacroAssembler::constant_oop_address(jobject obj) {
4044 #ifdef ASSERT
4045   {
4046     ThreadInVMfromUnknown tiv;
4047     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4048     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4049   }
4050 #endif
4051   int oop_index = oop_recorder()->find_index(obj);
4052   return Address((address)obj, oop_Relocation::spec(oop_index));
4053 }
4054 
4055 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4056 void MacroAssembler::tlab_allocate(Register obj,
4057                                    Register var_size_in_bytes,
4058                                    int con_size_in_bytes,
4059                                    Register t1,
4060                                    Register t2,
4061                                    Label& slow_case) {
4062   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4063   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4064 }
4065 
4066 // Defines obj, preserves var_size_in_bytes
4067 void MacroAssembler::eden_allocate(Register obj,
4068                                    Register var_size_in_bytes,
4069                                    int con_size_in_bytes,
4070                                    Register t1,
4071                                    Label& slow_case) {
4072   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4073   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4074 }
4075 
4076 // Zero words; len is in bytes
4077 // Destroys all registers except addr
4078 // len must be a nonzero multiple of wordSize
4079 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4080   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4081 
4082 #ifdef ASSERT
4083   { Label L;
4084     tst(len, BytesPerWord - 1);
4085     br(Assembler::EQ, L);
4086     stop("len is not a multiple of BytesPerWord");
4087     bind(L);
4088   }
4089 #endif
4090 
4091 #ifndef PRODUCT
4092   block_comment("zero memory");
4093 #endif
4094 
4095   Label loop;
4096   Label entry;
4097 
4098 //  Algorithm:
4099 //
4100 //    scratch1 = cnt & 7;
4101 //    cnt -= scratch1;
4102 //    p += scratch1;
4103 //    switch (scratch1) {
4104 //      do {
4105 //        cnt -= 8;
4106 //          p[-8] = 0;
4107 //        case 7:
4108 //          p[-7] = 0;
4109 //        case 6:
4110 //          p[-6] = 0;
4111 //          // ...
4112 //        case 1:
4113 //          p[-1] = 0;
4114 //        case 0:
4115 //          p += 8;
4116 //      } while (cnt);
4117 //    }
4118 
4119   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4120 
4121   lsr(len, len, LogBytesPerWord);
4122   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4123   sub(len, len, rscratch1);      // cnt -= cnt % unroll
4124   // t1 always points to the end of the region we're about to zero
4125   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4126   adr(rscratch2, entry);
4127   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
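  // A note on the computed branch above (an illustrative note, not emitted
  // code): each unrolled str below assembles to 4 bytes, so subtracting
  // rscratch1 * 4 from the address of 'entry' jumps into the middle of the
  // unrolled store sequence, Duff's-device style, executing exactly
  // cnt % unroll stores on the first pass.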
4128   br(rscratch2);
4129   bind(loop);
4130   sub(len, len, unroll);
4131   for (int i = -unroll; i < 0; i++)
4132     Assembler::str(zr, Address(t1, i * wordSize));
4133   bind(entry);
4134   add(t1, t1, unroll * wordSize);
4135   cbnz(len, loop);
4136 }
4137 
4138 void MacroAssembler::verify_tlab() {
4139 #ifdef ASSERT
4140   if (UseTLAB && VerifyOops) {
4141     Label next, ok;
4142 
4143     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4144 
4145     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4146     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4147     cmp(rscratch2, rscratch1);
4148     br(Assembler::HS, next);
4149     STOP("assert(top >= start)");
4150     should_not_reach_here();
4151 
4152     bind(next);
4153     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4154     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4155     cmp(rscratch2, rscratch1);
4156     br(Assembler::HS, ok);
4157     STOP("assert(top <= end)");
4158     should_not_reach_here();
4159 
4160     bind(ok);
4161     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4162   }
4163 #endif
4164 }
4165 
4166 // Writes to stack successive pages until offset reached to check for
4167 // stack overflow + shadow pages.  This clobbers tmp.
4168 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4169   assert_different_registers(tmp, size, rscratch1);
4170   mov(tmp, sp);
4171   // Bang stack for total size given plus shadow page size.
4172   // Bang one page at a time because large size can bang beyond yellow and
4173   // red zones.
4174   Label loop;
4175   mov(rscratch1, os::vm_page_size());
4176   bind(loop);
4177   lea(tmp, Address(tmp, -os::vm_page_size()));
4178   subsw(size, size, rscratch1);
4179   str(size, Address(tmp));
4180   br(Assembler::GT, loop);
4181 
4182   // Bang down shadow pages too.
4183   // At this point, (tmp-0) is the last address touched, so don't
4184   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4185   // was post-decremented.)  Skip this address by starting at i=1, and
4186   // touch a few more pages below.  N.B.  It is important to touch all
4187   // the way down to and including i=StackShadowPages.
4188   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4189     // this could be any sized move, but since it can serve as a debugging
4190     // crumb, the bigger the better.
4191     lea(tmp, Address(tmp, -os::vm_page_size()));
4192     str(size, Address(tmp));
4193   }
4194 }
4195 
4196 
4197 // Move the address of the polling page into dest.
4198 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4199   if (SafepointMechanism::uses_thread_local_poll()) {
4200     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4201   } else {
4202     unsigned long off;
4203     adrp(dest, Address(page, rtype), off);
4204     assert(off == 0, "polling page must be page aligned");
4205   }
4206 }
4207 
4208 // Move the address of the polling page into r, then read the polling
4209 // page.
4210 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4211   get_polling_page(r, page, rtype);
4212   return read_polling_page(r, rtype);
4213 }
4214 
4215 // Read the polling page.  The address of the polling page must
4216 // already be in r.
4217 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4218   InstructionMark im(this);
4219   code_section()->relocate(inst_mark(), rtype);
4220   ldrw(zr, Address(r, 0));
4221   return inst_mark();
4222 }
4223 
4224 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
4225   uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
4226   uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
4227   uint64_t dest_page = (uint64_t)dest.target() >> 12;
4228   int64_t offset_low = dest_page - low_page;
4229   int64_t offset_high = dest_page - high_page;
4230 
4231   assert(is_valid_AArch64_address(dest.target()), "bad address");
4232   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4233 
4234   InstructionMark im(this);
4235   code_section()->relocate(inst_mark(), dest.rspec());
4236   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4237   // the code cache so that if it is relocated we know it will still reach
4238   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4239     _adrp(reg1, dest.target());
4240   } else {
4241     uint64_t target = (uint64_t)dest.target();
4242     uint64_t adrp_target
4243       = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
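    // Why this works (an illustrative note): adrp_target keeps the
    // target's low 32 bits but borrows bits 47:32 from the current pc,
    // so it is guaranteed to be within ADRP's +/-4GB reach; the movk
    // below then rewrites bits 47:32 with the real target's upper half.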
4244 
4245     _adrp(reg1, (address)adrp_target);
4246     movk(reg1, target >> 32, 32);
4247   }
4248   byte_offset = (uint64_t)dest.target() & 0xfff;
4249 }
4250 
4251 void MacroAssembler::load_byte_map_base(Register reg) {
4252   jbyte *byte_map_base =
4253     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4254 
4255   // Strictly speaking the byte_map_base isn't an address at all, and it might
4256   // even be negative. It is thus materialised as a constant.
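  // For context (an illustrative sketch, not code emitted here): barrier
  // code computes the card address for an oop roughly as
  //   card = byte_map_base + (addr >> CardTable::card_shift)
  // so byte_map_base only needs to become a valid address after that biasing.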
4257   mov(reg, (uint64_t)byte_map_base);
4258 }
4259 
4260 void MacroAssembler::build_frame(int framesize) {
4261   assert(framesize > 0, "framesize must be > 0");
4262   if (framesize < ((1 << 9) + 2 * wordSize)) {
4263     sub(sp, sp, framesize);
4264     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4265     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4266   } else {
4267     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4268     if (PreserveFramePointer) mov(rfp, sp);
4269     if (framesize < ((1 << 12) + 2 * wordSize))
4270       sub(sp, sp, framesize - 2 * wordSize);
4271     else {
4272       mov(rscratch1, framesize - 2 * wordSize);
4273       sub(sp, sp, rscratch1);
4274     }
4275   }
4276 }
4277 
4278 void MacroAssembler::remove_frame(int framesize) {
4279   assert(framesize > 0, "framesize must be > 0");
4280   if (framesize < ((1 << 9) + 2 * wordSize)) {
4281     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4282     add(sp, sp, framesize);
4283   } else {
4284     if (framesize < ((1 << 12) + 2 * wordSize))
4285       add(sp, sp, framesize - 2 * wordSize);
4286     else {
4287       mov(rscratch1, framesize - 2 * wordSize);
4288       add(sp, sp, rscratch1);
4289     }
4290     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4291   }
4292 }
4293 
4294 #ifdef COMPILER2
4295 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4296 
4297 // Search for str1 in str2 and return index or -1
4298 void MacroAssembler::string_indexof(Register str2, Register str1,
4299                                     Register cnt2, Register cnt1,
4300                                     Register tmp1, Register tmp2,
4301                                     Register tmp3, Register tmp4,
4302                                     Register tmp5, Register tmp6,
4303                                     int icnt1, Register result, int ae) {
4304   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4305   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4306 
4307   Register ch1 = rscratch1;
4308   Register ch2 = rscratch2;
4309   Register cnt1tmp = tmp1;
4310   Register cnt2tmp = tmp2;
4311   Register cnt1_neg = cnt1;
4312   Register cnt2_neg = cnt2;
4313   Register result_tmp = tmp4;
4314 
4315   bool isL = ae == StrIntrinsicNode::LL;
4316 
4317   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4318   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4319   int str1_chr_shift = str1_isL ? 0:1;
4320   int str2_chr_shift = str2_isL ? 0:1;
4321   int str1_chr_size = str1_isL ? 1:2;
4322   int str2_chr_size = str2_isL ? 1:2;
4323   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4324                                       (chr_insn)&MacroAssembler::ldrh;
4325   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4326                                       (chr_insn)&MacroAssembler::ldrh;
4327   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4328   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4329 
4330   // Note, inline_string_indexOf() generates checks:
4331   // if (substr.count > string.count) return -1;
4332   // if (substr.count == 0) return 0;
4333 
4334   // We have two strings, a source string in str2, cnt2 and a pattern string
4335 // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4336 
4337 // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4338 // With a small pattern and source we use a linear scan.
4339 
4340   if (icnt1 == -1) {
4341     sub(result_tmp, cnt2, cnt1);
4342     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4343     br(LT, LINEARSEARCH);
4344     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4345     cmp(cnt1, 256);
4346     lsr(tmp1, cnt2, 2);
4347     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4348     br(GE, LINEARSTUB);
4349   }
4350 
4351 // The Boyer-Moore algorithm is based on the description here:
4352 //
4353 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4354 //
4355 // This describes an algorithm with two shift rules: the 'Bad Character'
4356 // rule and the 'Good Suffix' rule.
4357 //
4358 // These rules are essentially heuristics for how far we can shift the
4359 // pattern along the search string.
4360 //
4361 // The implementation here uses the 'Bad Character' rule only because of the
4362 // complexity of initialisation for the 'Good Suffix' rule.
4363 //
4364 // This is also known as the Boyer-Moore-Horspool algorithm:
4365 //
4366 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4367 //
4368 // This particular implementation has a few Java-specific optimizations.
4369 //
4370 // #define ASIZE 256
4371 //
4372 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4373 //       int i, j;
4374 //       unsigned c;
4375 //       unsigned char bc[ASIZE];
4376 //
4377 //       /* Preprocessing */
4378 //       for (i = 0; i < ASIZE; ++i)
4379 //          bc[i] = m;
4380 //       for (i = 0; i < m - 1; ) {
4381 //          c = x[i];
4382 //          ++i;
4383 //          // c < 256 for Latin1 string, so, no need for branch
4384 //          #ifdef PATTERN_STRING_IS_LATIN1
4385 //          bc[c] = m - i;
4386 //          #else
4387 //          if (c < ASIZE) bc[c] = m - i;
4388 //          #endif
4389 //       }
4390 //
4391 //       /* Searching */
4392 //       j = 0;
4393 //       while (j <= n - m) {
4394 //          c = y[j+m-1];
4395 //          if (x[m-1] == c)
4396 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4397 //          if (i < 0) return j;
4398 //          // c < 256 for Latin1 string, so, no need for branch
4399 //          #ifdef SOURCE_STRING_IS_LATIN1
4400 //          // LL case: (c < 256) always true. Remove branch
4401 //          j += bc[y[j+m-1]];
4402 //          #endif
4403 //          #ifndef PATTERN_STRING_IS_UTF
4404 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4405 //          if (c < ASIZE)
4406 //            j += bc[y[j+m-1]];
4407 //          else
4408 //            j += 1
4409 //          #endif
4410 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4411 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4412 //          if (c < ASIZE)
4413 //            j += bc[y[j+m-1]];
4414 //          else
4415 //            j += m
4416 //          #endif
4417 //       }
4418 //    }
4419 
4420   if (icnt1 == -1) {
4421     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4422         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4423     Register cnt1end = tmp2;
4424     Register str2end = cnt2;
4425     Register skipch = tmp2;
4426 
4427     // str1 length is >= 8, so we can read at least one register for cases when
4428     // UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half a
4429     // register for the UL case. We'll re-read the last character in the inner
4430     // pre-loop code to keep a single outer pre-loop load
4431     const int firstStep = isL ? 7 : 3;
4432 
4433     const int ASIZE = 256;
4434     const int STORED_BYTES = 32; // number of bytes stored per instruction
4435     sub(sp, sp, ASIZE);
4436     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4437     mov(ch1, sp);
4438     BIND(BM_INIT_LOOP);
4439       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4440       subs(tmp5, tmp5, 1);
4441       br(GT, BM_INIT_LOOP);
4442 
4443       sub(cnt1tmp, cnt1, 1);
4444       mov(tmp5, str2);
4445       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4446       sub(ch2, cnt1, 1);
4447       mov(tmp3, str1);
4448     BIND(BCLOOP);
4449       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4450       if (!str1_isL) {
4451         cmp(ch1, ASIZE);
4452         br(HS, BCSKIP);
4453       }
4454       strb(ch2, Address(sp, ch1));
4455     BIND(BCSKIP);
4456       subs(ch2, ch2, 1);
4457       br(GT, BCLOOP);
4458 
4459       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4460       if (str1_isL == str2_isL) {
4461         // load last 8 bytes (8LL/4UU symbols)
4462         ldr(tmp6, Address(tmp6, -wordSize));
4463       } else {
4464         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4465         // convert Latin1 to UTF. We'll have to wait until load completed, but
4466         // it's still faster than per-character loads+checks
4467         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4468         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4469         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4470         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4471         orr(ch2, ch1, ch2, LSL, 16);
4472         orr(tmp6, tmp6, tmp3, LSL, 48);
4473         orr(tmp6, tmp6, ch2, LSL, 16);
4474       }
4475     BIND(BMLOOPSTR2);
4476       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4477       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4478       if (str1_isL == str2_isL) {
4479         // re-init tmp3. It's free because it executes in parallel with the
4480         // load above. The alternative is to initialize it before the loop, but
4481         // that would hurt performance on in-order systems with 2 or more ld/st pipelines
4482         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4483       }
4484       if (!isL) { // UU/UL case
4485         lsl(ch2, cnt1tmp, 1); // offset in bytes
4486       }
4487       cmp(tmp3, skipch);
4488       br(NE, BMSKIP);
4489       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4490       mov(ch1, tmp6);
4491       if (isL) {
4492         b(BMLOOPSTR1_AFTER_LOAD);
4493       } else {
4494         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4495         b(BMLOOPSTR1_CMP);
4496       }
4497     BIND(BMLOOPSTR1);
4498       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4499       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4500     BIND(BMLOOPSTR1_AFTER_LOAD);
4501       subs(cnt1tmp, cnt1tmp, 1);
4502       br(LT, BMLOOPSTR1_LASTCMP);
4503     BIND(BMLOOPSTR1_CMP);
4504       cmp(ch1, ch2);
4505       br(EQ, BMLOOPSTR1);
4506     BIND(BMSKIP);
4507       if (!isL) {
4508         // if we've met a UTF symbol while searching the Latin1 pattern, then
4509         // we can skip cnt1 symbols
4510         if (str1_isL != str2_isL) {
4511           mov(result_tmp, cnt1);
4512         } else {
4513           mov(result_tmp, 1);
4514         }
4515         cmp(skipch, ASIZE);
4516         br(HS, BMADV);
4517       }
4518       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4519     BIND(BMADV);
4520       sub(cnt1tmp, cnt1, 1);
4521       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4522       cmp(str2, str2end);
4523       br(LE, BMLOOPSTR2);
4524       add(sp, sp, ASIZE);
4525       b(NOMATCH);
4526     BIND(BMLOOPSTR1_LASTCMP);
4527       cmp(ch1, ch2);
4528       br(NE, BMSKIP);
4529     BIND(BMMATCH);
4530       sub(result, str2, tmp5);
4531       if (!str2_isL) lsr(result, result, 1);
4532       add(sp, sp, ASIZE);
4533       b(DONE);
4534 
4535     BIND(LINEARSTUB);
4536     cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
4537     br(LT, LINEAR_MEDIUM);
4538     mov(result, zr);
4539     RuntimeAddress stub = NULL;
4540     if (isL) {
4541       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4542       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4543     } else if (str1_isL) {
4544       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4545       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4546     } else {
4547       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4548       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4549     }
4550     trampoline_call(stub);
4551     b(DONE);
4552   }
4553 
4554   BIND(LINEARSEARCH);
4555   {
4556     Label DO1, DO2, DO3;
4557 
4558     Register str2tmp = tmp2;
4559     Register first = tmp3;
4560 
4561     if (icnt1 == -1)
4562     {
4563         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4564 
4565         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4566         br(LT, DOSHORT);
4567       BIND(LINEAR_MEDIUM);
4568         (this->*str1_load_1chr)(first, Address(str1));
4569         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4570         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4571         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4572         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4573 
4574       BIND(FIRST_LOOP);
4575         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4576         cmp(first, ch2);
4577         br(EQ, STR1_LOOP);
4578       BIND(STR2_NEXT);
4579         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4580         br(LE, FIRST_LOOP);
4581         b(NOMATCH);
4582 
4583       BIND(STR1_LOOP);
4584         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4585         add(cnt2tmp, cnt2_neg, str2_chr_size);
4586         br(GE, MATCH);
4587 
4588       BIND(STR1_NEXT);
4589         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4590         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4591         cmp(ch1, ch2);
4592         br(NE, STR2_NEXT);
4593         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4594         add(cnt2tmp, cnt2tmp, str2_chr_size);
4595         br(LT, STR1_NEXT);
4596         b(MATCH);
4597 
4598       BIND(DOSHORT);
4599       if (str1_isL == str2_isL) {
4600         cmp(cnt1, 2);
4601         br(LT, DO1);
4602         br(GT, DO3);
4603       }
4604     }
4605 
4606     if (icnt1 == 4) {
4607       Label CH1_LOOP;
4608 
4609         (this->*load_4chr)(ch1, str1);
4610         sub(result_tmp, cnt2, 4);
4611         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4612         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4613 
4614       BIND(CH1_LOOP);
4615         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4616         cmp(ch1, ch2);
4617         br(EQ, MATCH);
4618         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4619         br(LE, CH1_LOOP);
4620         b(NOMATCH);
4621       }
4622 
4623     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4624       Label CH1_LOOP;
4625 
4626       BIND(DO2);
4627         (this->*load_2chr)(ch1, str1);
4628         if (icnt1 == 2) {
4629           sub(result_tmp, cnt2, 2);
4630         }
4631         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4632         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4633       BIND(CH1_LOOP);
4634         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4635         cmp(ch1, ch2);
4636         br(EQ, MATCH);
4637         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4638         br(LE, CH1_LOOP);
4639         b(NOMATCH);
4640     }
4641 
4642     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4643       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4644 
4645       BIND(DO3);
4646         (this->*load_2chr)(first, str1);
4647         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4648         if (icnt1 == 3) {
4649           sub(result_tmp, cnt2, 3);
4650         }
4651         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4652         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4653       BIND(FIRST_LOOP);
4654         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4655         cmpw(first, ch2);
4656         br(EQ, STR1_LOOP);
4657       BIND(STR2_NEXT);
4658         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4659         br(LE, FIRST_LOOP);
4660         b(NOMATCH);
4661 
4662       BIND(STR1_LOOP);
4663         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4664         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4665         cmp(ch1, ch2);
4666         br(NE, STR2_NEXT);
4667         b(MATCH);
4668     }
4669 
4670     if (icnt1 == -1 || icnt1 == 1) {
4671       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4672 
4673       BIND(DO1);
4674         (this->*str1_load_1chr)(ch1, str1);
4675         cmp(cnt2, 8);
4676         br(LT, DO1_SHORT);
4677 
4678         sub(result_tmp, cnt2, 8/str2_chr_size);
4679         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4680         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4681         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4682 
4683         if (str2_isL) {
4684           orr(ch1, ch1, ch1, LSL, 8);
4685         }
4686         orr(ch1, ch1, ch1, LSL, 16);
4687         orr(ch1, ch1, ch1, LSL, 32);
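        // The loop below uses a SWAR zero-lane test (an illustrative note):
        // after the eor, a lane that matched the broadcast character is zero,
        // and for each lane value v the quantity (v - 1) & ~v has its top bit
        // set iff v == 0; bics exposes that bit in the flags.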
4688       BIND(CH1_LOOP);
4689         ldr(ch2, Address(str2, cnt2_neg));
4690         eor(ch2, ch1, ch2);
4691         sub(tmp1, ch2, tmp3);
4692         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4693         bics(tmp1, tmp1, tmp2);
4694         br(NE, HAS_ZERO);
4695         adds(cnt2_neg, cnt2_neg, 8);
4696         br(LT, CH1_LOOP);
4697 
4698         cmp(cnt2_neg, 8);
4699         mov(cnt2_neg, 0);
4700         br(LT, CH1_LOOP);
4701         b(NOMATCH);
4702 
4703       BIND(HAS_ZERO);
4704         rev(tmp1, tmp1);
4705         clz(tmp1, tmp1);
4706         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4707         b(MATCH);
4708 
4709       BIND(DO1_SHORT);
4710         mov(result_tmp, cnt2);
4711         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4712         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4713       BIND(DO1_LOOP);
4714         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4715         cmpw(ch1, ch2);
4716         br(EQ, MATCH);
4717         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4718         br(LT, DO1_LOOP);
4719     }
4720   }
4721   BIND(NOMATCH);
4722     mov(result, -1);
4723     b(DONE);
4724   BIND(MATCH);
4725     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4726   BIND(DONE);
4727 }
4728 
4729 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4730 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4731 
4732 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4733                                          Register ch, Register result,
4734                                          Register tmp1, Register tmp2, Register tmp3)
4735 {
4736   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4737   Register cnt1_neg = cnt1;
4738   Register ch1 = rscratch1;
4739   Register result_tmp = rscratch2;
4740 
4741   cbz(cnt1, NOMATCH);
4742 
4743   cmp(cnt1, 4);
4744   br(LT, DO1_SHORT);
4745 
4746   orr(ch, ch, ch, LSL, 16);
4747   orr(ch, ch, ch, LSL, 32);
4748 
4749   sub(cnt1, cnt1, 4);
4750   mov(result_tmp, cnt1);
4751   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4752   sub(cnt1_neg, zr, cnt1, LSL, 1);
4753 
4754   mov(tmp3, 0x0001000100010001);
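  // Same SWAR zero-lane test as in string_indexof above (an illustrative
  // note): a matching 16-bit lane becomes zero after the eor and is
  // detected via (v - 1) & ~v & 0x8000 per lane.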
4755 
4756   BIND(CH1_LOOP);
4757     ldr(ch1, Address(str1, cnt1_neg));
4758     eor(ch1, ch, ch1);
4759     sub(tmp1, ch1, tmp3);
4760     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4761     bics(tmp1, tmp1, tmp2);
4762     br(NE, HAS_ZERO);
4763     adds(cnt1_neg, cnt1_neg, 8);
4764     br(LT, CH1_LOOP);
4765 
4766     cmp(cnt1_neg, 8);
4767     mov(cnt1_neg, 0);
4768     br(LT, CH1_LOOP);
4769     b(NOMATCH);
4770 
4771   BIND(HAS_ZERO);
4772     rev(tmp1, tmp1);
4773     clz(tmp1, tmp1);
4774     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4775     b(MATCH);
4776 
4777   BIND(DO1_SHORT);
4778     mov(result_tmp, cnt1);
4779     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4780     sub(cnt1_neg, zr, cnt1, LSL, 1);
4781   BIND(DO1_LOOP);
4782     ldrh(ch1, Address(str1, cnt1_neg));
4783     cmpw(ch, ch1);
4784     br(EQ, MATCH);
4785     adds(cnt1_neg, cnt1_neg, 2);
4786     br(LT, DO1_LOOP);
4787   BIND(NOMATCH);
4788     mov(result, -1);
4789     b(DONE);
4790   BIND(MATCH);
4791     add(result, result_tmp, cnt1_neg, ASR, 1);
4792   BIND(DONE);
4793 }
4794 
4795 // Compare strings.
4796 void MacroAssembler::string_compare(Register str1, Register str2,
4797     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4798     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4799   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4800       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4801       SHORT_LOOP_START, TAIL_CHECK;
4802 
4803   const int STUB_THRESHOLD = 64 + 8;
4804   bool isLL = ae == StrIntrinsicNode::LL;
4805   bool isLU = ae == StrIntrinsicNode::LU;
4806   bool isUL = ae == StrIntrinsicNode::UL;
4807 
4808   bool str1_isL = isLL || isLU;
4809   bool str2_isL = isLL || isUL;
4810 
4811   int str1_chr_shift = str1_isL ? 0 : 1;
4812   int str2_chr_shift = str2_isL ? 0 : 1;
4813   int str1_chr_size = str1_isL ? 1 : 2;
4814   int str2_chr_size = str2_isL ? 1 : 2;
4815   int minCharsInWord = isLL ? wordSize : wordSize/2;
4816 
4817   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4818   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4819                                       (chr_insn)&MacroAssembler::ldrh;
4820   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4821                                       (chr_insn)&MacroAssembler::ldrh;
4822   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4823                             (uxt_insn)&MacroAssembler::uxthw;
4824 
4825   BLOCK_COMMENT("string_compare {");
4826 
4827   // Bizarrely, the counts are passed in bytes, regardless of whether they
4828   // are L or U strings; however, the result is always in characters.
4829   if (!str1_isL) asrw(cnt1, cnt1, 1);
4830   if (!str2_isL) asrw(cnt2, cnt2, 1);
4831 
4832   // Compute the minimum of the string lengths and save the difference.
4833   subsw(result, cnt1, cnt2);
4834   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4835 
4836   // A very short string
4837   cmpw(cnt2, minCharsInWord);
4838   br(Assembler::LE, SHORT_STRING);
4839 
4840   // Compare longwords
4841   // load first parts of strings and finish initialization while loading
4842   {
4843     if (str1_isL == str2_isL) { // LL or UU
4844       ldr(tmp1, Address(str1));
4845       cmp(str1, str2);
4846       br(Assembler::EQ, DONE);
4847       ldr(tmp2, Address(str2));
4848       cmp(cnt2, STUB_THRESHOLD);
4849       br(GE, STUB);
4850       subsw(cnt2, cnt2, minCharsInWord);
4851       br(EQ, TAIL_CHECK);
4852       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4853       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4854       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4855     } else if (isLU) {
4856       ldrs(vtmp, Address(str1));
4857       ldr(tmp2, Address(str2));
4858       cmp(cnt2, STUB_THRESHOLD);
4859       br(GE, STUB);
4860       subw(cnt2, cnt2, 4);
4861       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4862       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4863       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4864       zip1(vtmp, T8B, vtmp, vtmpZ);
4865       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4866       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4867       add(cnt1, cnt1, 4);
4868       fmovd(tmp1, vtmp);
4869     } else { // UL case
4870       ldr(tmp1, Address(str1));
4871       ldrs(vtmp, Address(str2));
4872       cmp(cnt2, STUB_THRESHOLD);
4873       br(GE, STUB);
4874       subw(cnt2, cnt2, 4);
4875       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4876       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4877       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4878       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4879       zip1(vtmp, T8B, vtmp, vtmpZ);
4880       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4881       add(cnt1, cnt1, 8);
4882       fmovd(tmp2, vtmp);
4883     }
4884     adds(cnt2, cnt2, isUL ? 4 : 8);
4885     br(GE, TAIL);
4886     eor(rscratch2, tmp1, tmp2);
4887     cbnz(rscratch2, DIFFERENCE);
4888     // main loop
4889     bind(NEXT_WORD);
4890     if (str1_isL == str2_isL) {
4891       ldr(tmp1, Address(str1, cnt2));
4892       ldr(tmp2, Address(str2, cnt2));
4893       adds(cnt2, cnt2, 8);
4894     } else if (isLU) {
4895       ldrs(vtmp, Address(str1, cnt1));
4896       ldr(tmp2, Address(str2, cnt2));
4897       add(cnt1, cnt1, 4);
4898       zip1(vtmp, T8B, vtmp, vtmpZ);
4899       fmovd(tmp1, vtmp);
4900       adds(cnt2, cnt2, 8);
4901     } else { // UL
4902       ldrs(vtmp, Address(str2, cnt2));
4903       ldr(tmp1, Address(str1, cnt1));
4904       zip1(vtmp, T8B, vtmp, vtmpZ);
4905       add(cnt1, cnt1, 8);
4906       fmovd(tmp2, vtmp);
4907       adds(cnt2, cnt2, 4);
4908     }
4909     br(GE, TAIL);
4910 
4911     eor(rscratch2, tmp1, tmp2);
4912     cbz(rscratch2, NEXT_WORD);
4913     b(DIFFERENCE);
4914     bind(TAIL);
4915     eor(rscratch2, tmp1, tmp2);
4916     cbnz(rscratch2, DIFFERENCE);
4917     // Last longword.  In the case where length == 4 we compare the
4918     // same longword twice, but that's still faster than another
4919     // conditional branch.
4920     if (str1_isL == str2_isL) {
4921       ldr(tmp1, Address(str1));
4922       ldr(tmp2, Address(str2));
4923     } else if (isLU) {
4924       ldrs(vtmp, Address(str1));
4925       ldr(tmp2, Address(str2));
4926       zip1(vtmp, T8B, vtmp, vtmpZ);
4927       fmovd(tmp1, vtmp);
4928     } else { // UL
4929       ldrs(vtmp, Address(str2));
4930       ldr(tmp1, Address(str1));
4931       zip1(vtmp, T8B, vtmp, vtmpZ);
4932       fmovd(tmp2, vtmp);
4933     }
4934     bind(TAIL_CHECK);
4935     eor(rscratch2, tmp1, tmp2);
4936     cbz(rscratch2, DONE);
4937 
4938     // Find the first different characters in the longwords and
4939     // compute their difference.
4940     bind(DIFFERENCE);
4941     rev(rscratch2, rscratch2);
4942     clz(rscratch2, rscratch2);
4943     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4944     lsrv(tmp1, tmp1, rscratch2);
4945     (this->*ext_chr)(tmp1, tmp1);
4946     lsrv(tmp2, tmp2, rscratch2);
4947     (this->*ext_chr)(tmp2, tmp2);
4948     subw(result, tmp1, tmp2);
4949     b(DONE);
4950   }
4951 
4952   bind(STUB);
4953     RuntimeAddress stub = NULL;
4954     switch(ae) {
4955       case StrIntrinsicNode::LL:
4956         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
4957         break;
4958       case StrIntrinsicNode::UU:
4959         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
4960         break;
4961       case StrIntrinsicNode::LU:
4962         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
4963         break;
4964       case StrIntrinsicNode::UL:
4965         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
4966         break;
4967       default:
4968         ShouldNotReachHere();
4969     }
4970     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
4971     trampoline_call(stub);
4972     b(DONE);
4973 
4974   bind(SHORT_STRING);
4975   // Is the minimum length zero?
4976   cbz(cnt2, DONE);
4977   // arrange code to do most branches while loading, and to load the next
4978   // characters while comparing the previous ones
4979   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
4980   subs(cnt2, cnt2, 1);
4981   br(EQ, SHORT_LAST_INIT);
4982   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4983   b(SHORT_LOOP_START);
4984   bind(SHORT_LOOP);
4985   subs(cnt2, cnt2, 1);
4986   br(EQ, SHORT_LAST);
4987   bind(SHORT_LOOP_START);
4988   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
4989   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
4990   cmp(tmp1, cnt1);
4991   br(NE, SHORT_LOOP_TAIL);
4992   subs(cnt2, cnt2, 1);
4993   br(EQ, SHORT_LAST2);
4994   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
4995   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4996   cmp(tmp2, rscratch1);
4997   br(EQ, SHORT_LOOP);
4998   sub(result, tmp2, rscratch1);
4999   b(DONE);
5000   bind(SHORT_LOOP_TAIL);
5001   sub(result, tmp1, cnt1);
5002   b(DONE);
5003   bind(SHORT_LAST2);
5004   cmp(tmp2, rscratch1);
5005   br(EQ, DONE);
5006   sub(result, tmp2, rscratch1);
5007 
5008   b(DONE);
5009   bind(SHORT_LAST_INIT);
5010   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5011   bind(SHORT_LAST);
5012   cmp(tmp1, cnt1);
5013   br(EQ, DONE);
5014   sub(result, tmp1, cnt1);
5015 
5016   bind(DONE);
5017 
5018   BLOCK_COMMENT("} string_compare");
5019 }
5020 #endif // COMPILER2
5021 
5022 // This method checks whether the provided byte array contains a byte with the highest bit set.
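// Scalar equivalent (an illustrative sketch, not the emitted code):
//   bool has_negatives(jbyte *ary1, int len) {
//     for (int i = 0; i < len; i++)
//       if (ary1[i] & 0x80) return true;
//     return false;
//   }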
5023 address MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5024     // The simple and most common case of a small aligned array that is not
5025     // at the end of a memory page is handled here. All other cases are in the stub.
5026     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5027     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5028     assert_different_registers(ary1, len, result);
5029 
5030     cmpw(len, 0);
5031     br(LE, SET_RESULT);
5032     cmpw(len, 4 * wordSize);
5033     br(GE, STUB_LONG); // size > 32 then go to stub
5034 
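    // Page-crossing test (an illustrative note): the lsl below keeps only
    // the in-page offset bits of ary1 in the top of rscratch1, so the adds
    // sets the carry (CS) exactly when offset-in-page + 32 bytes would run
    // past the end of the page.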
5035     int shift = 64 - exact_log2(os::vm_page_size());
5036     lsl(rscratch1, ary1, shift);
5037     mov(rscratch2, (uint64_t)(4 * wordSize) << shift);
5038     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5039     br(CS, STUB); // at the end of page then go to stub
5040     subs(len, len, wordSize);
5041     br(LT, END);
5042 
5043   BIND(LOOP);
5044     ldr(rscratch1, Address(post(ary1, wordSize)));
5045     tst(rscratch1, UPPER_BIT_MASK);
5046     br(NE, SET_RESULT);
5047     subs(len, len, wordSize);
5048     br(GE, LOOP);
5049     cmpw(len, -wordSize);
5050     br(EQ, SET_RESULT);
5051 
5052   BIND(END);
5053     ldr(result, Address(ary1));
5054     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5055     lslv(result, result, len);
5056     tst(result, UPPER_BIT_MASK);
5057     b(SET_RESULT);
5058 
5059   BIND(STUB);
5060     RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
5061     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5062     address tpc1 = trampoline_call(has_neg);
5063     if (tpc1 == NULL) {
5064       DEBUG_ONLY(reset_labels3(STUB_LONG, SET_RESULT, DONE));
5065       postcond(pc() == badAddress);
5066       return NULL;
5067     }
5068     b(DONE);
5069 
5070   BIND(STUB_LONG);
5071     RuntimeAddress has_neg_long = RuntimeAddress(StubRoutines::aarch64::has_negatives_long());
5072     assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5073     address tpc2 = trampoline_call(has_neg_long);
5074     if (tpc2 == NULL) {
5075       DEBUG_ONLY(reset_labels2(SET_RESULT, DONE));
5076       postcond(pc() == badAddress);
5077       return NULL;
5078     }
5079     b(DONE);
5080 
5081   BIND(SET_RESULT);
5082     cset(result, NE); // set true or false
5083 
5084   BIND(DONE);
5085   postcond(pc() != badAddress);
5086   return pc();
5087 }
5088 
5089 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5090                                       Register tmp4, Register tmp5, Register result,
5091                                       Register cnt1, int elem_size) {
5092   Label DONE, SAME;
5093   Register tmp1 = rscratch1;
5094   Register tmp2 = rscratch2;
5095   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5096   int elem_per_word = wordSize/elem_size;
5097   int log_elem_size = exact_log2(elem_size);
5098   int length_offset = arrayOopDesc::length_offset_in_bytes();
5099   int base_offset
5100     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5101   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5102 
5103   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5104   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5105 
5106 #ifndef PRODUCT
5107   {
5108     const char kind = (elem_size == 2) ? 'U' : 'L';
5109     char comment[64];
5110     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5111     BLOCK_COMMENT(comment);
5112   }
5113 #endif
5114 
5115   // if (a1 == a2)
5116   //     return true;
5117   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5118   br(EQ, SAME);
5119 
5120   if (UseSimpleArrayEquals) {
5121     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5122     // if (a1 == null || a2 == null)
5123     //     return false;
5124     // (a1 & a2) == 0 means some pointer is null, or (very rarely, perhaps
5125     // never in practice) two pointer values that AND to zero;
5126     // either way, we can save one branch in most cases
5127     tst(a1, a2);
5128     mov(result, false);
5129     br(EQ, A_MIGHT_BE_NULL);
5130     // if (a1.length != a2.length)
5131     //      return false;
5132     bind(A_IS_NOT_NULL);
5133     ldrw(cnt1, Address(a1, length_offset));
5134     ldrw(cnt2, Address(a2, length_offset));
5135     eorw(tmp5, cnt1, cnt2);
5136     cbnzw(tmp5, DONE);
5137     lea(a1, Address(a1, base_offset));
5138     lea(a2, Address(a2, base_offset));
5139     // Check for short strings, i.e. smaller than wordSize.
5140     subs(cnt1, cnt1, elem_per_word);
5141     br(Assembler::LT, SHORT);
5142     // Main 8 byte comparison loop.
5143     bind(NEXT_WORD); {
5144       ldr(tmp1, Address(post(a1, wordSize)));
5145       ldr(tmp2, Address(post(a2, wordSize)));
5146       subs(cnt1, cnt1, elem_per_word);
5147       eor(tmp5, tmp1, tmp2);
5148       cbnz(tmp5, DONE);
5149     } br(GT, NEXT_WORD);
5150     // Last longword.  In the case where length == 4 we compare the
5151     // same longword twice, but that's still faster than another
5152     // conditional branch.
5153     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5154     // length == 4.
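    // e.g. (illustrative): a 12-byte byte array leaves cnt1 == -4 here,
    // so the loads below read bytes [4, 12), overlapping the word the
    // loop has already compared.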
5155     if (log_elem_size > 0)
5156       lsl(cnt1, cnt1, log_elem_size);
5157     ldr(tmp3, Address(a1, cnt1));
5158     ldr(tmp4, Address(a2, cnt1));
5159     eor(tmp5, tmp3, tmp4);
5160     cbnz(tmp5, DONE);
5161     b(SAME);
5162     bind(A_MIGHT_BE_NULL);
5163     // in case both a1 and a2 are not-null, proceed with loads
5164     cbz(a1, DONE);
5165     cbz(a2, DONE);
5166     b(A_IS_NOT_NULL);
5167     bind(SHORT);
5168 
5169     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5170     {
5171       ldrw(tmp1, Address(post(a1, 4)));
5172       ldrw(tmp2, Address(post(a2, 4)));
5173       eorw(tmp5, tmp1, tmp2);
5174       cbnzw(tmp5, DONE);
5175     }
5176     bind(TAIL03);
5177     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5178     {
5179       ldrh(tmp3, Address(post(a1, 2)));
5180       ldrh(tmp4, Address(post(a2, 2)));
5181       eorw(tmp5, tmp3, tmp4);
5182       cbnzw(tmp5, DONE);
5183     }
5184     bind(TAIL01);
5185     if (elem_size == 1) { // Only needed when comparing byte arrays.
5186       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5187       {
5188         ldrb(tmp1, a1);
5189         ldrb(tmp2, a2);
5190         eorw(tmp5, tmp1, tmp2);
5191         cbnzw(tmp5, DONE);
5192       }
5193     }
5194   } else {
5195     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
5196         CSET_EQ, LAST_CHECK;
5197     mov(result, false);
5198     cbz(a1, DONE);
5199     ldrw(cnt1, Address(a1, length_offset));
5200     cbz(a2, DONE);
5201     ldrw(cnt2, Address(a2, length_offset));
5202     // on most CPUs a2 is still "locked" (surprisingly) in ldrw and it's
5203     // faster to perform another branch before comparing a1 and a2
5204     cmp(cnt1, elem_per_word);
5205     br(LE, SHORT); // short or same
5206     ldr(tmp3, Address(pre(a1, base_offset)));
5207     cmp(cnt1, stubBytesThreshold);
5208     br(GE, STUB);
5209     ldr(tmp4, Address(pre(a2, base_offset)));
5210     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5211     cmp(cnt2, cnt1);
5212     br(NE, DONE);
5213 
5214     // Main 16 byte comparison loop with 2 exits
5215     bind(NEXT_DWORD); {
5216       ldr(tmp1, Address(pre(a1, wordSize)));
5217       ldr(tmp2, Address(pre(a2, wordSize)));
5218       subs(cnt1, cnt1, 2 * elem_per_word);
5219       br(LE, TAIL);
5220       eor(tmp4, tmp3, tmp4);
5221       cbnz(tmp4, DONE);
5222       ldr(tmp3, Address(pre(a1, wordSize)));
5223       ldr(tmp4, Address(pre(a2, wordSize)));
5224       cmp(cnt1, elem_per_word);
5225       br(LE, TAIL2);
5226       cmp(tmp1, tmp2);
5227     } br(EQ, NEXT_DWORD);
5228     b(DONE);
5229 
5230     bind(TAIL);
5231     eor(tmp4, tmp3, tmp4);
5232     eor(tmp2, tmp1, tmp2);
5233     lslv(tmp2, tmp2, tmp5);
5234     orr(tmp5, tmp4, tmp2);
5235     cmp(tmp5, zr);
5236     b(CSET_EQ);
5237 
5238     bind(TAIL2);
5239     eor(tmp2, tmp1, tmp2);
5240     cbnz(tmp2, DONE);
5241     b(LAST_CHECK);
5242 
5243     bind(STUB);
5244     ldr(tmp4, Address(pre(a2, base_offset)));
5245     cmp(cnt2, cnt1);
5246     br(NE, DONE);
5247     if (elem_size == 2) { // convert to byte counter
5248       lsl(cnt1, cnt1, 1);
5249     }
5250     eor(tmp5, tmp3, tmp4);
5251     cbnz(tmp5, DONE);
5252     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5253     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5254     address tpc = trampoline_call(stub);
5255     if (tpc == NULL) {
5256       DEBUG_ONLY(reset_labels5(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
5257       postcond(pc() == badAddress);
5258       return NULL;
5259     }
5260     b(DONE);
5261 
5262     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5263     // so, if a2 == null => return false (0), else return true; so we can return a2
5264     mov(result, a2);
5265     b(DONE);
5266     bind(SHORT);
5267     cmp(cnt2, cnt1);
5268     br(NE, DONE);
5269     cbz(cnt1, SAME);
5270     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5271     ldr(tmp3, Address(a1, base_offset));
5272     ldr(tmp4, Address(a2, base_offset));
5273     bind(LAST_CHECK);
5274     eor(tmp4, tmp3, tmp4);
5275     lslv(tmp5, tmp4, tmp5);
5276     cmp(tmp5, zr);
5277     bind(CSET_EQ);
5278     cset(result, EQ);
5279     b(DONE);
5280   }
5281 
5282   bind(SAME);
5283   mov(result, true);
5284   // That's it.
5285   bind(DONE);
5286 
5287   BLOCK_COMMENT("} array_equals");
5288   postcond(pc() != badAddress);
5289   return pc();
5290 }
5291 
5292 // Compare Strings
5293 
5294 // For Strings we're passed the address of the first characters in a1
5295 // and a2 and the length in cnt1.
5296 // elem_size is the element size in bytes: either 1 or 2.
5297 // There are two implementations.  For arrays >= 8 bytes, all
5298 // comparisons (including the final one, which may overlap) are
5299 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5300 // word, then a halfword, and then a byte.
5301 
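// Roughly (an illustrative sketch of the strategy, not the emitted code):
//   while (cnt1 >= 8) compare 8 bytes at a time (the final load may
//   overlap the previous one); otherwise the low bits of cnt1 drive the
//   trailing word, halfword and byte compares.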
5302 void MacroAssembler::string_equals(Register a1, Register a2,
5303                                    Register result, Register cnt1, int elem_size)
5304 {
5305   Label SAME, DONE, SHORT, NEXT_WORD;
5306   Register tmp1 = rscratch1;
5307   Register tmp2 = rscratch2;
5308   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5309 
5310   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5311   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5312 
5313 #ifndef PRODUCT
5314   {
5315     const char kind = (elem_size == 2) ? 'U' : 'L';
5316     char comment[64];
5317     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5318     BLOCK_COMMENT(comment);
5319   }
5320 #endif
5321 
5322   mov(result, false);
5323 
5324   // Check for short strings, i.e. smaller than wordSize.
5325   subs(cnt1, cnt1, wordSize);
5326   br(Assembler::LT, SHORT);
5327   // Main 8 byte comparison loop.
5328   bind(NEXT_WORD); {
5329     ldr(tmp1, Address(post(a1, wordSize)));
5330     ldr(tmp2, Address(post(a2, wordSize)));
5331     subs(cnt1, cnt1, wordSize);
5332     eor(tmp1, tmp1, tmp2);
5333     cbnz(tmp1, DONE);
5334   } br(GT, NEXT_WORD);
5335   // Last longword.  In the case where length == 4 we compare the
5336   // same longword twice, but that's still faster than another
5337   // conditional branch.
5338   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5339   // length == 4.
5340   ldr(tmp1, Address(a1, cnt1));
5341   ldr(tmp2, Address(a2, cnt1));
5342   eor(tmp2, tmp1, tmp2);
5343   cbnz(tmp2, DONE);
5344   b(SAME);
5345 
5346   bind(SHORT);
5347   Label TAIL03, TAIL01;
5348 
5349   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5350   {
5351     ldrw(tmp1, Address(post(a1, 4)));
5352     ldrw(tmp2, Address(post(a2, 4)));
5353     eorw(tmp1, tmp1, tmp2);
5354     cbnzw(tmp1, DONE);
5355   }
5356   bind(TAIL03);
5357   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5358   {
5359     ldrh(tmp1, Address(post(a1, 2)));
5360     ldrh(tmp2, Address(post(a2, 2)));
5361     eorw(tmp1, tmp1, tmp2);
5362     cbnzw(tmp1, DONE);
5363   }
5364   bind(TAIL01);
5365   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5366     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5367     {
5368       ldrb(tmp1, a1);
5369       ldrb(tmp2, a2);
5370       eorw(tmp1, tmp1, tmp2);
5371       cbnzw(tmp1, DONE);
5372     }
5373   }
5374   // Arrays are equal.
5375   bind(SAME);
5376   mov(result, true);
5377 
5378   // That's it.
5379   bind(DONE);
5380   BLOCK_COMMENT("} string_equals");
5381 }
5382 
5383 
5384 // The size of the blocks erased by the zero_blocks stub.  We must
5385 // handle anything smaller than this ourselves in zero_words().
5386 const int MacroAssembler::zero_words_block_size = 8;
5387 
5388 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5389 // possible, handling small word counts locally and delegating
5390 // anything larger to the zero_blocks stub.  It is expanded many times
5391 // in compiled code, so it is important to keep it short.
5392 
5393 // ptr:   Address of a buffer to be zeroed.
5394 // cnt:   Count in HeapWords.
5395 //
5396 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
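// In outline (an illustrative sketch of the expansion below):
//   if (cnt >= zero_words_block_size) call the zero_blocks stub;  // bulk
//   then the low bits of cnt drive a short chain of stp/str stores for
//   the 0..7 remaining words.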
5397 address MacroAssembler::zero_words(Register ptr, Register cnt)
5398 {
5399   assert(is_power_of_2(zero_words_block_size), "adjust this");
5400   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5401 
5402   BLOCK_COMMENT("zero_words {");
5403   cmp(cnt, zero_words_block_size);
5404   Label around, done, done16;
5405   br(LO, around);
5406   {
5407     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5408     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5409     if (StubRoutines::aarch64::complete()) {
5410       address tpc = trampoline_call(zero_blocks);
5411       if (tpc == NULL) {
5412         DEBUG_ONLY(reset_labels1(around));
5413         postcond(pc() == badAddress);
5414         return NULL;
5415       }
5416     } else {
5417       bl(zero_blocks);
5418     }
5419   }
5420   bind(around);
5421   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5422     Label l;
5423     tbz(cnt, exact_log2(i), l);
5424     for (int j = 0; j < i; j += 2) {
5425       stp(zr, zr, post(ptr, 16));
5426     }
5427     bind(l);
5428   }
5429   {
5430     Label l;
5431     tbz(cnt, 0, l);
5432     str(zr, Address(ptr));
5433     bind(l);
5434   }
5435   BLOCK_COMMENT("} zero_words");
5436   postcond(pc() != badAddress);
5437   return pc();
5438 }
5439 
5440 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5441 // cnt:          Immediate count in HeapWords.
5442 #define SmallArraySize (18 * BytesPerLong)
5443 void MacroAssembler::zero_words(Register base, uint64_t cnt)
5444 {
5445   BLOCK_COMMENT("zero_words {");
5446   int i = cnt & 1;  // store any odd word to start
5447   if (i) str(zr, Address(base));
5448 
5449   if (cnt <= SmallArraySize / BytesPerLong) {
5450     for (; i < (int)cnt; i += 2) {
5451       stp(zr, zr, Address(base, i * wordSize));
5452     }
5453   } else {
5454     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5455     int remainder = cnt % (2 * unroll);
5456     for (; i < remainder; i += 2) {
5457       stp(zr, zr, Address(base, i * wordSize));
5458     }
5459     Label loop;
5460     Register cnt_reg = rscratch1;
5461     Register loop_base = rscratch2;
5462     cnt = cnt - remainder;
5463     mov(cnt_reg, cnt);
5464     // adjust base and prebias by -2 * wordSize so we can pre-increment
5465     add(loop_base, base, (remainder - 2) * wordSize);
5466     bind(loop);
5467     sub(cnt_reg, cnt_reg, 2 * unroll);
5468     for (i = 1; i < unroll; i++) {
5469       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5470     }
5471     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5472     cbnz(cnt_reg, loop);
5473   }
5474   BLOCK_COMMENT("} zero_words");
5475 }
5476 
5477 // Zero blocks of memory by using DC ZVA.
5478 //
5479 // Aligns the base address first sufficiently for DC ZVA, then uses
5480 // DC ZVA repeatedly for every full block.  cnt is the size to be
5481 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5482 // in cnt.
5483 //
5484 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5485 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
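// In outline (an illustrative sketch):
//   store pairs of zeros until base is aligned to zva_length;
//   while (cnt >= zva_length / 8) { dc zva, base; base += zva_length; }
//   the words not zeroed by DC ZVA stay in cnt for the caller to finish.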
5486 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5487   Register tmp = rscratch1;
5488   Register tmp2 = rscratch2;
5489   int zva_length = VM_Version::zva_length();
5490   Label initial_table_end, loop_zva;
5491   Label fini;
5492 
5493   // Base must be 16 byte aligned. If not just return and let caller handle it
5494   tst(base, 0x0f);
5495   br(Assembler::NE, fini);
5496   // Align base with ZVA length.
5497   neg(tmp, base);
5498   andr(tmp, tmp, zva_length - 1);
5499 
5500   // tmp: the number of bytes to be filled to align the base with ZVA length.
5501   add(base, base, tmp);
5502   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5503   adr(tmp2, initial_table_end);
5504   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5505   br(tmp2);
5506 
5507   for (int i = -zva_length + 16; i < 0; i += 16)
5508     stp(zr, zr, Address(base, i));
5509   bind(initial_table_end);
5510 
5511   sub(cnt, cnt, zva_length >> 3);
5512   bind(loop_zva);
5513   dc(Assembler::ZVA, base);
5514   subs(cnt, cnt, zva_length >> 3);
5515   add(base, base, zva_length);
5516   br(Assembler::GE, loop_zva);
5517   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5518   bind(fini);
5519 }

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill the buffer with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

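  // Note on the computed entry below: rscratch1 holds the partial
  // chunk in words (an even value, 0 .. (unroll-1)*2).  Each stp is
  // a 4-byte instruction storing two words, so the tail needs
  // rscratch1/2 stps, i.e. we must enter 4 * rscratch1/2 =
  // 2 * rscratch1 code bytes before `entry' -- hence the LSL-by-1
  // in the sub() computing rscratch2.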
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
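//
// In effect this implements (a C-like sketch; the code below merely
// processes 32, 8, or 1 char(s) per iteration):
//
//   int i;
//   for (i = 0; i < len; i++) {
//     jchar c = src[i];
//     if (c > 0xff) break;       // not ISO-8859-1 representable
//     dst[i] = (jbyte)c;
//   }
//   result = i;                  // == len iff all chars were encoded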
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);

    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
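//
// The central trick below is zip1 against a zero vector: interleaving
// each byte with a zero byte is exactly the little-endian widening of
// a byte into a 16-bit char.  As a C-like sketch:
//
//   for (int i = 0; i < len; i++)
//     dst[i] = (jchar)(src[i] & 0xff);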
address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           FloatRegister vtmp3, Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes one at a time.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      address tpc = trampoline_call(stub);
      if (tpc == NULL) {
        DEBUG_ONLY(reset_labels2(big, done));
        postcond(pc() == badAddress);
        return NULL;
      }
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
  postcond(pc() != badAddress);
  return pc();
}

// Compress char[] array to byte[].
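//
// Result convention (note: encode_iso_array leaves the number of
// unprocessed chars in len):
//
//   result = (len == 0) ? original_length : 0;
//
// so the caller learns only whether compression succeeded in full.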
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

#ifdef __OpenBSD__
// OpenBSD uses emulated TLS, so it can't use aarch64_get_thread_helper().
// Save whatever non-callee-save context might get clobbered by
// Thread::current.
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r18) + lr - dst;
  push(saved_regs, sp);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}
#else
// get_thread() can be called anywhere inside generated code, so we
// need to save whatever non-callee-save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// by the call setup code.
//
// On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
// On other systems, the helper is an ordinary C function.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs =
    LINUX_ONLY(RegSet::range(r0, r1)  + lr - dst)
    NOT_LINUX (RegSet::range(r0, r17) + lr - dst);

  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}
#endif