/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
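      // For illustration, a hypothetical type 2 case: an adrp/add pair at
      // pc 0x10000 relocated to target 0x13a10 gives adr_page - pc_page =
      // 0x13 - 0x10 = 3 and offset_lo = 0xa10, so the adrp immediate is
      // patched to 3 pages and the add immediate to 0xa10.
      //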
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
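  //
  // A sketch of the two sequences being patched (operands illustrative):
  //   narrow: movz Rx, #(n >> 16), lsl #16
  //           movk Rx, #(n & 0xffff)
  //   wide:   movz Rx, #(dest & 0xffff)
  //           movk Rx, #((dest >> 16) & 0xffff), lsl #16
  //           movk Rx, #((dest >> 32) & 0xffff), lsl #32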
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
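// A sketch of the reordering the acquire rules out: with a plain load,
// the later load of SafepointSynchronize::_state could be satisfied
// before this load of Thread::_polling_page, so we could observe a
// disarmed poll word together with a stale _state value and miss a
// safepoint that is still in progress.
//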
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
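  //
  // For reference, a sketch of the biased-lock mark word layout this code
  // relies on (see markWord.hpp for the authoritative definition):
  //   [JavaThread* (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2)]
  // where biased_lock_pattern means the low three bits are 0b101.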
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
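//
// A sketch of the stub this emits (a concrete rendering of the code below):
//   ldr  rscratch1, <data>   // load the 64-bit destination stored below
//   br   rscratch1           // LR is untouched, so it still returns to the call site
//   <data>: 8-byte destination address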

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}


void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
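  //       e.g. x == 0x12345600 has a zero low byte, so the
  //       result must be 0 even though x itself is non-zero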
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).
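  //
  // Concretely (a sketch of the aliasing): for a primary supertype at
  // depth d, super_check_offset is the offset of _primary_supers[d], so
  // the ldr above reads the displayed super at that depth; for a type
  // with secondary supers, super_check_offset equals sc_offset and the
  // very same ldr reads the one-element secondary super cache instead.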

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

1213 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1214                                                    Register super_klass,
1215                                                    Register temp_reg,
1216                                                    Register temp2_reg,
1217                                                    Label* L_success,
1218                                                    Label* L_failure,
1219                                                    bool set_cond_codes) {
1220   assert_different_registers(sub_klass, super_klass, temp_reg);
1221   if (temp2_reg != noreg)
1222     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1223 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1224 
1225   Label L_fallthrough;
1226   int label_nulls = 0;
1227   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1228   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1229   assert(label_nulls <= 1, "at most one NULL in the batch");
1230 
1231   // a couple of useful fields in sub_klass:
1232   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1233   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1234   Address secondary_supers_addr(sub_klass, ss_offset);
1235   Address super_cache_addr(     sub_klass, sc_offset);
1236 
1237   BLOCK_COMMENT("check_klass_subtype_slow_path");
1238 
1239   // Do a linear scan of the secondary super-klass chain.
1240   // This code is rarely used, so simplicity is a virtue here.
1241   // The repne_scan loop below uses fixed registers (r0, r2, r5), which we must spill.
1242   // Don't worry too much about pre-existing connections with the input regs.
1243 
1244   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1245   assert(sub_klass != r2, "killed reg"); // killed by the length load into r2
1246 
1247   RegSet pushed_registers;
1248   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1249   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1250 
1251   if (super_klass != r0 || UseCompressedOops) {
1252     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1253   }
1254 
1255   push(pushed_registers, sp);
1256 
1257   // Get super_klass value into r0 (even if it was in r5 or r2).
1258   if (super_klass != r0) {
1259     mov(r0, super_klass);
1260   }
1261 
1262 #ifndef PRODUCT
1263   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1264   Address pst_counter_addr(rscratch2);
1265   ldr(rscratch1, pst_counter_addr);
1266   add(rscratch1, rscratch1, 1);
1267   str(rscratch1, pst_counter_addr);
1268 #endif //PRODUCT
1269 
1270   // We will consult the secondary-super array.
1271   ldr(r5, secondary_supers_addr);
1272   // Load the array length.
1273   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1274   // Skip to start of data.
1275   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1276 
1277   cmp(sp, zr); // Clear Z flag; SP is never zero
1278   // Scan R2 words at [R5] for an occurrence of R0.
1279   // Set NZ/Z based on last compare.
1280   repne_scan(r5, r0, r2, rscratch1);
1281 
1282   // Unspill the temp. registers:
1283   pop(pushed_registers, sp);
1284 
1285   br(Assembler::NE, *L_failure);
1286 
1287   // Success.  Cache the super we found and proceed in triumph.
1288   str(super_klass, super_cache_addr);
1289 
1290   if (L_success != &L_fallthrough) {
1291     b(*L_success);
1292   }
1293 
1294 #undef IS_A_TEMP
1295 
1296   bind(L_fallthrough);
1297 }
1298 
1299 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1300   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1301   assert_different_registers(klass, rthread, scratch);
1302 
1303   Label L_fallthrough, L_tmp;
1304   if (L_fast_path == NULL) {
1305     L_fast_path = &L_fallthrough;
1306   } else if (L_slow_path == NULL) {
1307     L_slow_path = &L_fallthrough;
1308   }
1309   // Fast path check: class is fully initialized
1310   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1311   subs(zr, scratch, InstanceKlass::fully_initialized);
1312   br(Assembler::EQ, *L_fast_path);
1313 
1314   // Fast path check: current thread is initializer thread
1315   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1316   cmp(rthread, scratch);
1317 
1318   if (L_slow_path == &L_fallthrough) {
1319     br(Assembler::EQ, *L_fast_path);
1320     bind(*L_slow_path);
1321   } else if (L_fast_path == &L_fallthrough) {
1322     br(Assembler::NE, *L_slow_path);
1323     bind(*L_fast_path);
1324   } else {
1325     Unimplemented();
1326   }
1327 }
1328 
1329 void MacroAssembler::verify_oop(Register reg, const char* s) {
1330   if (!VerifyOops) return;
1331 
1332   // Build a message naming the register, to pass to verify_oop_subroutine
1333   const char* b = NULL;
1334   {
1335     ResourceMark rm;
1336     stringStream ss;
1337     ss.print("verify_oop: %s: %s", reg->name(), s);
1338     b = code_string(ss.as_string());
1339   }
1340   BLOCK_COMMENT("verify_oop {");
1341 
1342   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1343   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1344 
1345   mov(r0, reg);
1346   mov(rscratch1, (address)b);
1347 
1348   // call indirectly to solve generation ordering problem
1349   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1350   ldr(rscratch2, Address(rscratch2));
1351   blr(rscratch2);
1352 
1353   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1354   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1355 
1356   BLOCK_COMMENT("} verify_oop");
1357 }
1358 
1359 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1360   if (!VerifyOops) return;
1361 
1362   const char* b = NULL;
1363   {
1364     ResourceMark rm;
1365     stringStream ss;
1366     ss.print("verify_oop_addr: %s", s);
1367     b = code_string(ss.as_string());
1368   }
1369   BLOCK_COMMENT("verify_oop_addr {");
1370 
1371   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1372   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1373 
1374   // addr may contain sp so we will have to adjust it based on the
1375   // pushes that we just did.
1376   if (addr.uses(sp)) {
1377     lea(r0, addr);
1378     ldr(r0, Address(r0, 4 * wordSize));
1379   } else {
1380     ldr(r0, addr);
1381   }
1382   mov(rscratch1, (address)b);
1383 
1384   // call indirectly to solve generation ordering problem
1385   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1386   ldr(rscratch2, Address(rscratch2));
1387   blr(rscratch2);
1388 
1389   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1390   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1391 
1392   BLOCK_COMMENT("} verify_oop_addr");
1393 }
1394 
1395 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1396                                          int extra_slot_offset) {
1397   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1398   int stackElementSize = Interpreter::stackElementSize;
1399   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1400 #ifdef ASSERT
1401   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1402   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1403 #endif
1404   if (arg_slot.is_constant()) {
1405     return Address(esp, arg_slot.as_constant() * stackElementSize
1406                    + offset);
1407   } else {
1408     add(rscratch1, esp, arg_slot.as_register(),
1409         ext::uxtx, exact_log2(stackElementSize));
1410     return Address(rscratch1, offset);
1411   }
1412 }
1413 
1414 void MacroAssembler::call_VM_leaf_base(address entry_point,
1415                                        int number_of_arguments,
1416                                        Label *retaddr) {
1417   Label E, L;
1418 
1419   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1420 
1421   mov(rscratch1, entry_point);
1422   blr(rscratch1);
1423   if (retaddr)
1424     bind(*retaddr);
1425 
1426   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1427   maybe_isb();
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1431   call_VM_leaf_base(entry_point, number_of_arguments);
1432 }
1433 
1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1435   pass_arg0(this, arg_0);
1436   call_VM_leaf_base(entry_point, 1);
1437 }
1438 
1439 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1440   pass_arg0(this, arg_0);
1441   pass_arg1(this, arg_1);
1442   call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1446                                   Register arg_1, Register arg_2) {
1447   pass_arg0(this, arg_0);
1448   pass_arg1(this, arg_1);
1449   pass_arg2(this, arg_2);
1450   call_VM_leaf_base(entry_point, 3);
1451 }
1452 
1453 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1454   pass_arg0(this, arg_0);
1455   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1456 }
1457 
1458 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1459 
1460   assert(arg_0 != c_rarg1, "smashed arg");
1461   pass_arg1(this, arg_1);
1462   pass_arg0(this, arg_0);
1463   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1464 }
1465 
1466 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1467   assert(arg_0 != c_rarg2, "smashed arg");
1468   assert(arg_1 != c_rarg2, "smashed arg");
1469   pass_arg2(this, arg_2);
1470   assert(arg_0 != c_rarg1, "smashed arg");
1471   pass_arg1(this, arg_1);
1472   pass_arg0(this, arg_0);
1473   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1474 }
1475 
1476 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1477   assert(arg_0 != c_rarg3, "smashed arg");
1478   assert(arg_1 != c_rarg3, "smashed arg");
1479   assert(arg_2 != c_rarg3, "smashed arg");
1480   pass_arg3(this, arg_3);
1481   assert(arg_0 != c_rarg2, "smashed arg");
1482   assert(arg_1 != c_rarg2, "smashed arg");
1483   pass_arg2(this, arg_2);
1484   assert(arg_0 != c_rarg1, "smashed arg");
1485   pass_arg1(this, arg_1);
1486   pass_arg0(this, arg_0);
1487   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1488 }
1489 
1490 void MacroAssembler::null_check(Register reg, int offset) {
1491   if (needs_explicit_null_check(offset)) {
1492     // provoke OS NULL exception if reg = NULL by
1493     // accessing M[reg] w/o changing any registers
1494     // NOTE: this is plenty to provoke a segv
1495     ldr(zr, Address(reg));
1496   } else {
1497     // nothing to do, (later) access of M[reg + offset]
1498     // will provoke OS NULL exception if reg = NULL
1499   }
1500 }
1501 
1502 // MacroAssembler protected routines needed to implement
1503 // public methods
1504 
1505 void MacroAssembler::mov(Register r, Address dest) {
1506   code_section()->relocate(pc(), dest.rspec());
1507   u_int64_t imm64 = (u_int64_t)dest.target();
1508   movptr(r, imm64);
1509 }
1510 
1511 // Move a constant pointer into r.  In AArch64 mode the virtual
1512 // address space is 48 bits in size, so we only need three
1513 // instructions to create a patchable instruction sequence that can
1514 // reach anywhere.
1515 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1516 #ifndef PRODUCT
1517   {
1518     char buffer[64];
1519     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1520     block_comment(buffer);
1521   }
1522 #endif
1523   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1524   movz(r, imm64 & 0xffff);
1525   imm64 >>= 16;
1526   movk(r, imm64 & 0xffff, 16);
1527   imm64 >>= 16;
1528   movk(r, imm64 & 0xffff, 32);
1529 }
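
// Illustrative sketch (comment only, not emitted here): for an example
// constant such as 0x123456789ABC, movptr() above produces the fixed
// three-instruction pattern
//   movz r, #0x9abc
//   movk r, #0x5678, lsl #16
//   movk r, #0x1234, lsl #32
// The constant shape of the sequence is what keeps it patchable in place.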
1530 
1531 // Macro to mov replicated immediate to vector register.
1532 //  Vd will get the following values for different arrangements in T
1533 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1534 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1535 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1536 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1537 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1538 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1539 //   T1D/T2D: invalid
1540 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1541   assert(T != T1D && T != T2D, "invalid arrangement");
1542   if (T == T8B || T == T16B) {
1543     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1544     movi(Vd, T, imm32 & 0xff, 0);
1545     return;
1546   }
1547   u_int32_t nimm32 = ~imm32;
1548   if (T == T4H || T == T8H) {
1549     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1550     imm32 &= 0xffff;
1551     nimm32 &= 0xffff;
1552   }
1553   u_int32_t x = imm32;
1554   int movi_cnt = 0;
1555   int movn_cnt = 0;
1556   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1557   x = nimm32;
1558   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1559   if (movn_cnt < movi_cnt) imm32 = nimm32;
1560   unsigned lsl = 0;
1561   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1562   if (movn_cnt < movi_cnt)
1563     mvni(Vd, T, imm32 & 0xff, lsl);
1564   else
1565     movi(Vd, T, imm32 & 0xff, lsl);
1566   imm32 >>= 8; lsl += 8;
1567   while (imm32) {
1568     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1569     if (movn_cnt < movi_cnt)
1570       bici(Vd, T, imm32 & 0xff, lsl);
1571     else
1572       orri(Vd, T, imm32 & 0xff, lsl);
1573     lsl += 8; imm32 >>= 8;
1574   }
1575 }
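
// Worked example (a comment sketch, not emitted here): mov(v0, T4S,
// 0x00ffffff) sees three non-zero bytes in imm32 but only one in ~imm32,
// so the MOVN path wins and a single instruction suffices:
//   mvni v0.4s, #0xff, lsl #24   // each lane = ~(0xff << 24) = 0x00ffffff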
1576 
1577 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1578 {
1579 #ifndef PRODUCT
1580   {
1581     char buffer[64];
1582     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1583     block_comment(buffer);
1584   }
1585 #endif
1586   if (operand_valid_for_logical_immediate(false, imm64)) {
1587     orr(dst, zr, imm64);
1588   } else {
1589     // we can use a combination of MOVZ or MOVN with
1590     // MOVK to build up the constant
1591     u_int64_t imm_h[4];
1592     int zero_count = 0;
1593     int neg_count = 0;
1594     int i;
1595     for (i = 0; i < 4; i++) {
1596       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1597       if (imm_h[i] == 0) {
1598         zero_count++;
1599       } else if (imm_h[i] == 0xffffL) {
1600         neg_count++;
1601       }
1602     }
1603     if (zero_count == 4) {
1604       // one MOVZ will do
1605       movz(dst, 0);
1606     } else if (neg_count == 4) {
1607       // one MOVN will do
1608       movn(dst, 0);
1609     } else if (zero_count == 3) {
1610       for (i = 0; i < 4; i++) {
1611         if (imm_h[i] != 0L) {
1612           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1613           break;
1614         }
1615       }
1616     } else if (neg_count == 3) {
1617       // one MOVN will do
1618       for (int i = 0; i < 4; i++) {
1619         if (imm_h[i] != 0xffffL) {
1620           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1621           break;
1622         }
1623       }
1624     } else if (zero_count == 2) {
1625       // one MOVZ and one MOVK will do
1626       for (i = 0; i < 3; i++) {
1627         if (imm_h[i] != 0L) {
1628           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1629           i++;
1630           break;
1631         }
1632       }
1633       for (;i < 4; i++) {
1634         if (imm_h[i] != 0L) {
1635           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1636         }
1637       }
1638     } else if (neg_count == 2) {
1639       // one MOVN and one MOVK will do
1640       for (i = 0; i < 4; i++) {
1641         if (imm_h[i] != 0xffffL) {
1642           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1643           i++;
1644           break;
1645         }
1646       }
1647       for (;i < 4; i++) {
1648         if (imm_h[i] != 0xffffL) {
1649           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1650         }
1651       }
1652     } else if (zero_count == 1) {
1653       // one MOVZ and two MOVKs will do
1654       for (i = 0; i < 4; i++) {
1655         if (imm_h[i] != 0L) {
1656           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1657           i++;
1658           break;
1659         }
1660       }
1661       for (;i < 4; i++) {
1662         if (imm_h[i] != 0x0L) {
1663           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1664         }
1665       }
1666     } else if (neg_count == 1) {
1667       // one MOVN and two MOVKs will do
1668       for (i = 0; i < 4; i++) {
1669         if (imm_h[i] != 0xffffL) {
1670           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1671           i++;
1672           break;
1673         }
1674       }
1675       for (;i < 4; i++) {
1676         if (imm_h[i] != 0xffffL) {
1677           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1678         }
1679       }
1680     } else {
1681       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1682       movz(dst, (u_int32_t)imm_h[0], 0);
1683       for (i = 1; i < 4; i++) {
1684         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1685       }
1686     }
1687   }
1688 }
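
// Worked examples (comment sketch): imm64 == 0x0000ffff00000000 has three
// zero halfwords, so the zero_count == 3 case above emits a single
//   movz dst, #0xffff, lsl #32
// while imm64 == 0xffffffff0000ffff has three 0xffff halfwords, so the
// neg_count == 3 case emits a single
//   movn dst, #0xffff, lsl #16   // ~(0xffff << 16)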
1689 
1690 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1691 {
1692 #ifndef PRODUCT
1693     {
1694       char buffer[64];
1695       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1696       block_comment(buffer);
1697     }
1698 #endif
1699   if (operand_valid_for_logical_immediate(true, imm32)) {
1700     orrw(dst, zr, imm32);
1701   } else {
1702     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1703     // constant
1704     u_int32_t imm_h[2];
1705     imm_h[0] = imm32 & 0xffff;
1706     imm_h[1] = ((imm32 >> 16) & 0xffff);
1707     if (imm_h[0] == 0) {
1708       movzw(dst, imm_h[1], 16);
1709     } else if (imm_h[0] == 0xffff) {
1710       movnw(dst, imm_h[1] ^ 0xffff, 16);
1711     } else if (imm_h[1] == 0) {
1712       movzw(dst, imm_h[0], 0);
1713     } else if (imm_h[1] == 0xffff) {
1714       movnw(dst, imm_h[0] ^ 0xffff, 0);
1715     } else {
1716       // use a MOVZ and MOVK (makes it easier to debug)
1717       movzw(dst, imm_h[0], 0);
1718       movkw(dst, imm_h[1], 16);
1719     }
1720   }
1721 }
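
// Worked example (comment sketch): assuming 0xabcd1234 is not encodable as
// a logical immediate, neither halfword is 0 or 0xffff, so the final branch
// above emits the debug-friendly pair
//   movz dst, #0x1234
//   movk dst, #0xabcd, lsl #16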
1722 
1723 // Form an address from base + offset in Rd.  Rd may or may
1724 // not actually be used: you must use the Address that is returned.
1725 // It is up to you to ensure that the shift provided matches the size
1726 // of your data.
1727 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1728   if (Address::offset_ok_for_immed(byte_offset, shift))
1729     // It fits; no need for any heroics
1730     return Address(base, byte_offset);
1731 
1732   // Don't do anything clever with negative or misaligned offsets
1733   unsigned mask = (1 << shift) - 1;
1734   if (byte_offset < 0 || byte_offset & mask) {
1735     mov(Rd, byte_offset);
1736     add(Rd, base, Rd);
1737     return Address(Rd);
1738   }
1739 
1740   // See if we can do this with two 12-bit offsets
1741   {
1742     unsigned long word_offset = byte_offset >> shift;
1743     unsigned long masked_offset = word_offset & 0xfff000;
1744     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1745         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1746       add(Rd, base, masked_offset << shift);
1747       word_offset -= masked_offset;
1748       return Address(Rd, word_offset << shift);
1749     }
1750   }
1751 
1752   // Do it the hard way
1753   mov(Rd, byte_offset);
1754   add(Rd, base, Rd);
1755   return Address(Rd);
1756 }
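
// Worked example (comment sketch): form_address(rscratch1, base, 0x40008, 3)
// cannot encode the offset directly (0x8001 words exceeds 12 bits), but the
// two-part split above can:
//   add rscratch1, base, #0x40000      // masked_offset << shift
//   ... then Address(rscratch1, 8)     // residual word_offset << shift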
1757 
1758 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1759   if (UseLSE) {
1760     mov(tmp, 1);
1761     ldadd(Assembler::word, tmp, zr, counter_addr);
1762     return;
1763   }
1764   Label retry_load;
1765   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1766     prfm(Address(counter_addr), PSTL1STRM);
1767   bind(retry_load);
1768   // flush and load exclusive from the memory location
1769   ldxrw(tmp, counter_addr);
1770   addw(tmp, tmp, 1);
1771   // if we store+flush with no intervening write, tmp2 will be zero
1772   stxrw(tmp2, tmp, counter_addr);
1773   cbnzw(tmp2, retry_load);
1774 }
1775 
1776 
1777 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1778                                     bool want_remainder, Register scratch)
1779 {
1780   // Full implementation of Java idiv and irem.  The function
1781   // returns the (pc) offset of the div instruction - may be needed
1782   // for implicit exceptions.
1783   //
1784   // constraint : ra/rb =/= scratch
1785   //         normal case
1786   //
1787   // input : ra: dividend
1788   //         rb: divisor
1789   //
1790   // result: either
1791   //         quotient  (= ra idiv rb)
1792   //         remainder (= ra irem rb)
1793 
1794   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1795 
1796   int idivl_offset = offset();
1797   if (! want_remainder) {
1798     sdivw(result, ra, rb);
1799   } else {
1800     sdivw(scratch, ra, rb);
1801     Assembler::msubw(result, scratch, rb, ra);
1802   }
1803 
1804   return idivl_offset;
1805 }
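
// Worked example (comment sketch): for ra == -7, rb == 2 with want_remainder,
// sdivw truncates toward zero giving scratch == -3, and msubw computes
// result = ra - scratch * rb = -7 - (-6) = -1, matching Java's irem sign rule.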
1806 
1807 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1808                                     bool want_remainder, Register scratch)
1809 {
1810   // Full implementation of Java ldiv and lrem.  The function
1811   // returns the (pc) offset of the div instruction - may be needed
1812   // for implicit exceptions.
1813   //
1814   // constraint : ra/rb =/= scratch
1815   //         normal case
1816   //
1817   // input : ra: dividend
1818   //         rb: divisor
1819   //
1820   // result: either
1821   //         quotient  (= ra idiv rb)
1822   //         remainder (= ra irem rb)
1823 
1824   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1825 
1826   int idivq_offset = offset();
1827   if (! want_remainder) {
1828     sdiv(result, ra, rb);
1829   } else {
1830     sdiv(scratch, ra, rb);
1831     Assembler::msub(result, scratch, rb, ra);
1832   }
1833 
1834   return idivq_offset;
1835 }
1836 
1837 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1838   address prev = pc() - NativeMembar::instruction_size;
1839   address last = code()->last_insn();
1840   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1841     NativeMembar *bar = NativeMembar_at(prev);
1842     // We are merging two memory barrier instructions.  On AArch64 we
1843     // can do this simply by ORing them together.
1844     bar->set_kind(bar->get_kind() | order_constraint);
1845     BLOCK_COMMENT("merged membar");
1846   } else {
1847     code()->set_last_insn(pc());
1848     dmb(Assembler::barrier(order_constraint));
1849   }
1850 }
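
// Illustrative sequence (a sketch): two back-to-back calls such as
//   membar(LoadLoad);
//   membar(LoadStore);
// emit one DMB for the first call; the second call finds that DMB via
// code()->last_insn() and simply ORs LoadStore into its barrier kind.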
1851 
1852 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1853   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1854     merge_ldst(rt, adr, size_in_bytes, is_store);
1855     code()->clear_last_insn();
1856     return true;
1857   } else {
1858     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte load/store is supported.");
1859     const unsigned mask = size_in_bytes - 1;
1860     if (adr.getMode() == Address::base_plus_offset &&
1861         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1862       code()->set_last_insn(pc());
1863     }
1864     return false;
1865   }
1866 }
1867 
1868 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1869   // We always try to merge two adjacent loads into one ldp.
1870   if (!try_merge_ldst(Rx, adr, 8, false)) {
1871     Assembler::ldr(Rx, adr);
1872   }
1873 }
1874 
1875 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1876   // We always try to merge two adjacent loads into one ldp.
1877   if (!try_merge_ldst(Rw, adr, 4, false)) {
1878     Assembler::ldrw(Rw, adr);
1879   }
1880 }
1881 
1882 void MacroAssembler::str(Register Rx, const Address &adr) {
1883   // We always try to merge two adjacent stores into one stp.
1884   if (!try_merge_ldst(Rx, adr, 8, true)) {
1885     Assembler::str(Rx, adr);
1886   }
1887 }
1888 
1889 void MacroAssembler::strw(Register Rw, const Address &adr) {
1890   // We always try to merge two adjacent stores into one stp.
1891   if (!try_merge_ldst(Rw, adr, 4, true)) {
1892     Assembler::strw(Rw, adr);
1893   }
1894 }
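
// Illustrative sequence (a sketch): two adjacent, contiguous stores such as
//   str(r0, Address(sp, 0));
//   str(r1, Address(sp, 8));
// let the second call's try_merge_ldst() fold the pair into a single
//   stp r0, r1, [sp]
// while any non-mergeable pattern falls through to the plain Assembler form.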
1895 
1896 // MacroAssembler routines found actually to be needed
1897 
1898 void MacroAssembler::push(Register src)
1899 {
1900   str(src, Address(pre(esp, -1 * wordSize)));
1901 }
1902 
1903 void MacroAssembler::pop(Register dst)
1904 {
1905   ldr(dst, Address(post(esp, 1 * wordSize)));
1906 }
1907 
1908 // Note: load_unsigned_short used to be called load_unsigned_word.
1909 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1910   int off = offset();
1911   ldrh(dst, src);
1912   return off;
1913 }
1914 
1915 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1916   int off = offset();
1917   ldrb(dst, src);
1918   return off;
1919 }
1920 
1921 int MacroAssembler::load_signed_short(Register dst, Address src) {
1922   int off = offset();
1923   ldrsh(dst, src);
1924   return off;
1925 }
1926 
1927 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1928   int off = offset();
1929   ldrsb(dst, src);
1930   return off;
1931 }
1932 
1933 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1934   int off = offset();
1935   ldrshw(dst, src);
1936   return off;
1937 }
1938 
1939 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1940   int off = offset();
1941   ldrsbw(dst, src);
1942   return off;
1943 }
1944 
1945 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1946   switch (size_in_bytes) {
1947   case  8:  ldr(dst, src); break;
1948   case  4:  ldrw(dst, src); break;
1949   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1950   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1951   default:  ShouldNotReachHere();
1952   }
1953 }
1954 
1955 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1956   switch (size_in_bytes) {
1957   case  8:  str(src, dst); break;
1958   case  4:  strw(src, dst); break;
1959   case  2:  strh(src, dst); break;
1960   case  1:  strb(src, dst); break;
1961   default:  ShouldNotReachHere();
1962   }
1963 }
1964 
1965 void MacroAssembler::decrementw(Register reg, int value)
1966 {
1967   if (value < 0)  { incrementw(reg, -value);      return; }
1968   if (value == 0) {                               return; }
1969   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1970   /* else */ {
1971     guarantee(reg != rscratch2, "invalid dst for register decrement");
1972     movw(rscratch2, (unsigned)value);
1973     subw(reg, reg, rscratch2);
1974   }
1975 }
1976 
1977 void MacroAssembler::decrement(Register reg, int value)
1978 {
1979   if (value < 0)  { increment(reg, -value);      return; }
1980   if (value == 0) {                              return; }
1981   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1982   /* else */ {
1983     assert(reg != rscratch2, "invalid dst for register decrement");
1984     mov(rscratch2, (unsigned long)value);
1985     sub(reg, reg, rscratch2);
1986   }
1987 }
1988 
1989 void MacroAssembler::decrementw(Address dst, int value)
1990 {
1991   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1992   if (dst.getMode() == Address::literal) {
1993     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1994     lea(rscratch2, dst);
1995     dst = Address(rscratch2);
1996   }
1997   ldrw(rscratch1, dst);
1998   decrementw(rscratch1, value);
1999   strw(rscratch1, dst);
2000 }
2001 
2002 void MacroAssembler::decrement(Address dst, int value)
2003 {
2004   assert(!dst.uses(rscratch1), "invalid address for decrement");
2005   if (dst.getMode() == Address::literal) {
2006     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2007     lea(rscratch2, dst);
2008     dst = Address(rscratch2);
2009   }
2010   ldr(rscratch1, dst);
2011   decrement(rscratch1, value);
2012   str(rscratch1, dst);
2013 }
2014 
2015 void MacroAssembler::incrementw(Register reg, int value)
2016 {
2017   if (value < 0)  { decrementw(reg, -value);      return; }
2018   if (value == 0) {                               return; }
2019   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2020   /* else */ {
2021     assert(reg != rscratch2, "invalid dst for register increment");
2022     movw(rscratch2, (unsigned)value);
2023     addw(reg, reg, rscratch2);
2024   }
2025 }
2026 
2027 void MacroAssembler::increment(Register reg, int value)
2028 {
2029   if (value < 0)  { decrement(reg, -value);      return; }
2030   if (value == 0) {                              return; }
2031   if (value < (1 << 12)) { add(reg, reg, value); return; }
2032   /* else */ {
2033     assert(reg != rscratch2, "invalid dst for register increment");
2034     movw(rscratch2, (unsigned)value);
2035     add(reg, reg, rscratch2);
2036   }
2037 }
2038 
2039 void MacroAssembler::incrementw(Address dst, int value)
2040 {
2041   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2042   if (dst.getMode() == Address::literal) {
2043     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2044     lea(rscratch2, dst);
2045     dst = Address(rscratch2);
2046   }
2047   ldrw(rscratch1, dst);
2048   incrementw(rscratch1, value);
2049   strw(rscratch1, dst);
2050 }
2051 
2052 void MacroAssembler::increment(Address dst, int value)
2053 {
2054   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2055   if (dst.getMode() == Address::literal) {
2056     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2057     lea(rscratch2, dst);
2058     dst = Address(rscratch2);
2059   }
2060   ldr(rscratch1, dst);
2061   increment(rscratch1, value);
2062   str(rscratch1, dst);
2063 }
2064 
2065 
2066 void MacroAssembler::pusha() {
2067   push(0x7fffffff, sp);
2068 }
2069 
2070 void MacroAssembler::popa() {
2071   pop(0x7fffffff, sp);
2072 }
2073 
2074 // Push lots of registers in the bit set supplied.  Don't push sp.
2075 // Return the number of words pushed
2076 int MacroAssembler::push(unsigned int bitset, Register stack) {
2077   int words_pushed = 0;
2078 
2079   // Scan bitset to accumulate register pairs
2080   unsigned char regs[32];
2081   int count = 0;
2082   for (int reg = 0; reg <= 30; reg++) {
2083     if (1 & bitset)
2084       regs[count++] = reg;
2085     bitset >>= 1;
2086   }
2087   regs[count++] = zr->encoding_nocheck();
2088   count &= ~1;  // Only push an even number of regs
2089 
2090   if (count) {
2091     stp(as_Register(regs[0]), as_Register(regs[1]),
2092        Address(pre(stack, -count * wordSize)));
2093     words_pushed += 2;
2094   }
2095   for (int i = 2; i < count; i += 2) {
2096     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2097        Address(stack, i * wordSize));
2098     words_pushed += 2;
2099   }
2100 
2101   assert(words_pushed == count, "oops, pushed != count");
2102 
2103   return count;
2104 }
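
// Worked example (comment sketch): push(0b0111, sp) collects r0, r1, r2,
// pads with zr to an even count, and emits
//   stp r0, r1, [sp, #-32]!
//   stp r2, zr, [sp, #16]
// returning 4, the number of words pushed.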
2105 
2106 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2107   int words_pushed = 0;
2108 
2109   // Scan bitset to accumulate register pairs
2110   unsigned char regs[32];
2111   int count = 0;
2112   for (int reg = 0; reg <= 30; reg++) {
2113     if (1 & bitset)
2114       regs[count++] = reg;
2115     bitset >>= 1;
2116   }
2117   regs[count++] = zr->encoding_nocheck();
2118   count &= ~1;
2119 
2120   for (int i = 2; i < count; i += 2) {
2121     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2122        Address(stack, i * wordSize));
2123     words_pushed += 2;
2124   }
2125   if (count) {
2126     ldp(as_Register(regs[0]), as_Register(regs[1]),
2127        Address(post(stack, count * wordSize)));
2128     words_pushed += 2;
2129   }
2130 
2131   assert(words_pushed == count, "oops, pushed != count");
2132 
2133   return count;
2134 }
2135 
2136 // Push lots of float registers in the bit set supplied.  Don't push sp.
2137 // Return the number of words pushed
2138 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2139   int words_pushed = 0;
2140 
2141   // Scan bitset to accumulate register pairs
2142   unsigned char regs[32];
2143   int count = 0;
2144   for (int reg = 0; reg <= 31; reg++) {
2145     if (1 & bitset)
2146       regs[count++] = reg;
2147     bitset >>= 1;
2148   }
2149   regs[count++] = zr->encoding_nocheck();
2150   count &= ~1;  // Only push an even number of regs
2151 
2152   // Always pushing full 128-bit registers.
2153   if (count) {
2154     stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2)));
2155     words_pushed += 2;
2156   }
2157   for (int i = 2; i < count; i += 2) {
2158     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2159     words_pushed += 2;
2160   }
2161 
2162   assert(words_pushed == count, "oops, pushed != count");
2163   return count;
2164 }
2165 
2166 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2167   int words_pushed = 0;
2168 
2169   // Scan bitset to accumulate register pairs
2170   unsigned char regs[32];
2171   int count = 0;
2172   for (int reg = 0; reg <= 31; reg++) {
2173     if (1 & bitset)
2174       regs[count++] = reg;
2175     bitset >>= 1;
2176   }
2177   regs[count++] = zr->encoding_nocheck();
2178   count &= ~1;
2179 
2180   for (int i = 2; i < count; i += 2) {
2181     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2182     words_pushed += 2;
2183   }
2184   if (count) {
2185     ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2)));
2186     words_pushed += 2;
2187   }
2188 
2189   assert(words_pushed == count, "oops, pushed != count");
2190 
2191   return count;
2192 }
2193 
2194 #ifdef ASSERT
2195 void MacroAssembler::verify_heapbase(const char* msg) {
2196 #if 0
2197   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2198   assert (Universe::heap() != NULL, "java heap should be initialized");
2199   if (CheckCompressedOops) {
2200     Label ok;
2201     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2202     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2203     br(Assembler::EQ, ok);
2204     stop(msg);
2205     bind(ok);
2206     pop(1 << rscratch1->encoding(), sp);
2207   }
2208 #endif
2209 }
2210 #endif
2211 
2212 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2213   Label done, not_weak;
2214   cbz(value, done);           // Use NULL as-is.
2215 
2216   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2217   tbz(value, 0, not_weak);    // Test for jweak tag.
2218 
2219   // Resolve jweak.
2220   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2221                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2222   verify_oop(value);
2223   b(done);
2224 
2225   bind(not_weak);
2226   // Resolve (untagged) jobject.
2227   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2228   verify_oop(value);
2229   bind(done);
2230 }
2231 
2232 void MacroAssembler::stop(const char* msg) {
2233   address ip = pc();
2234   pusha();
2235   mov(c_rarg0, (address)msg);
2236   mov(c_rarg1, (address)ip);
2237   mov(c_rarg2, sp);
2238   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2239   blr(c_rarg3);
2240   hlt(0);
2241 }
2242 
2243 void MacroAssembler::warn(const char* msg) {
2244   pusha();
2245   mov(c_rarg0, (address)msg);
2246   mov(lr, CAST_FROM_FN_PTR(address, warning));
2247   blr(lr);
2248   popa();
2249 }
2250 
2251 void MacroAssembler::unimplemented(const char* what) {
2252   const char* buf = NULL;
2253   {
2254     ResourceMark rm;
2255     stringStream ss;
2256     ss.print("unimplemented: %s", what);
2257     buf = code_string(ss.as_string());
2258   }
2259   stop(buf);
2260 }
2261 
2262 // If a constant does not fit in an immediate field, generate some
2263 // number of MOV instructions and then perform the operation.
2264 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2265                                            add_sub_imm_insn insn1,
2266                                            add_sub_reg_insn insn2) {
2267   assert(Rd != zr, "Rd = zr and not setting flags?");
2268   if (operand_valid_for_add_sub_immediate((int)imm)) {
2269     (this->*insn1)(Rd, Rn, imm);
2270   } else {
2271     if (uabs(imm) < (1 << 24)) {
2272        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2273        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2274     } else {
2275        assert_different_registers(Rd, Rn);
2276        mov(Rd, (uint64_t)imm);
2277        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2278     }
2279   }
2280 }
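
// Worked example (comment sketch): an immediate such as 0x123456 cannot be
// encoded directly, but it is below 1 << 24, so the split above yields
//   add Rd, Rn, #0x123000    // imm & -(1 << 12)
//   add Rd, Rd, #0x456       // imm & ((1 << 12) - 1)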
2281 
2282 // Separate version which sets the flags. Optimisations are more restricted
2283 // because we must set the flags correctly.
2284 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2285                                            add_sub_imm_insn insn1,
2286                                            add_sub_reg_insn insn2) {
2287   if (operand_valid_for_add_sub_immediate((int)imm)) {
2288     (this->*insn1)(Rd, Rn, imm);
2289   } else {
2290     assert_different_registers(Rd, Rn);
2291     assert(Rd != zr, "overflow in immediate operand");
2292     mov(Rd, (uint64_t)imm);
2293     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2294   }
2295 }
2296 
2297 
2298 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2299   if (increment.is_register()) {
2300     add(Rd, Rn, increment.as_register());
2301   } else {
2302     add(Rd, Rn, increment.as_constant());
2303   }
2304 }
2305 
2306 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2307   if (increment.is_register()) {
2308     addw(Rd, Rn, increment.as_register());
2309   } else {
2310     addw(Rd, Rn, increment.as_constant());
2311   }
2312 }
2313 
2314 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2315   if (decrement.is_register()) {
2316     sub(Rd, Rn, decrement.as_register());
2317   } else {
2318     sub(Rd, Rn, decrement.as_constant());
2319   }
2320 }
2321 
2322 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2323   if (decrement.is_register()) {
2324     subw(Rd, Rn, decrement.as_register());
2325   } else {
2326     subw(Rd, Rn, decrement.as_constant());
2327   }
2328 }
2329 
2330 void MacroAssembler::reinit_heapbase()
2331 {
2332   if (UseCompressedOops) {
2333     if (Universe::is_fully_initialized()) {
2334       mov(rheapbase, CompressedOops::ptrs_base());
2335     } else {
2336       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2337       ldr(rheapbase, Address(rheapbase));
2338     }
2339   }
2340 }
2341 
2342 // this simulates the behaviour of the x86 cmpxchg instruction using a
2343 // load linked/store conditional pair. we use the acquire/release
2344 // versions of these instructions so that we flush pending writes as
2345 // per Java semantics.
2346 
2347 // n.b the x86 version assumes the old value to be compared against is
2348 // in rax and updates rax with the value located in memory if the
2349 // cmpxchg fails. we supply a register for the old value explicitly
2350 
2351 // the aarch64 load linked/store conditional instructions do not
2352 // accept an offset. so, unlike x86, we must provide a plain register
2353 // to identify the memory word to be compared/exchanged rather than a
2354 // register+offset Address.
2355 
2356 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2357                                 Label &succeed, Label *fail) {
2358   // oldv holds comparison value
2359   // newv holds value to write in exchange
2360   // addr identifies memory word to compare against/update
2361   if (UseLSE) {
2362     mov(tmp, oldv);
2363     casal(Assembler::xword, oldv, newv, addr);
2364     cmp(tmp, oldv);
2365     br(Assembler::EQ, succeed);
2366     membar(AnyAny);
2367   } else {
2368     Label retry_load, nope;
2369     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2370       prfm(Address(addr), PSTL1STRM);
2371     bind(retry_load);
2372     // flush and load exclusive from the memory location
2373     // and fail if it is not what we expect
2374     ldaxr(tmp, addr);
2375     cmp(tmp, oldv);
2376     br(Assembler::NE, nope);
2377     // if we store+flush with no intervening write, tmp will be zero
2378     stlxr(tmp, newv, addr);
2379     cbzw(tmp, succeed);
2380     // retry so we only ever return after a load fails to compare;
2381     // this ensures we don't return a stale value after a failed write.
2382     b(retry_load);
2383     // if the memory word differs we return it in oldv and signal a fail
2384     bind(nope);
2385     membar(AnyAny);
2386     mov(oldv, tmp);
2387   }
2388   if (fail)
2389     b(*fail);
2390 }
2391 
2392 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2393                                         Label &succeed, Label *fail) {
2394   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2395   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2396 }
2397 
2398 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2399                                 Label &succeed, Label *fail) {
2400   // oldv holds comparison value
2401   // newv holds value to write in exchange
2402   // addr identifies memory word to compare against/update
2403   // tmp returns 0/1 for success/failure
2404   if (UseLSE) {
2405     mov(tmp, oldv);
2406     casal(Assembler::word, oldv, newv, addr);
2407     cmp(tmp, oldv);
2408     br(Assembler::EQ, succeed);
2409     membar(AnyAny);
2410   } else {
2411     Label retry_load, nope;
2412     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2413       prfm(Address(addr), PSTL1STRM);
2414     bind(retry_load);
2415     // flush and load exclusive from the memory location
2416     // and fail if it is not what we expect
2417     ldaxrw(tmp, addr);
2418     cmp(tmp, oldv);
2419     br(Assembler::NE, nope);
2420     // if we store+flush with no intervening write, tmp will be zero
2421     stlxrw(tmp, newv, addr);
2422     cbzw(tmp, succeed);
2423     // retry so we only ever return after a load fails to compare;
2424     // this ensures we don't return a stale value after a failed write.
2425     b(retry_load);
2426     // if the memory word differs we return it in oldv and signal a fail
2427     bind(nope);
2428     membar(AnyAny);
2429     mov(oldv, tmp);
2430   }
2431   if (fail)
2432     b(*fail);
2433 }
2434 
2435 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2436 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2437 // pass a register for the result; otherwise pass noreg.
2438 
2439 // Clobbers rscratch1
2440 void MacroAssembler::cmpxchg(Register addr, Register expected,
2441                              Register new_val,
2442                              enum operand_size size,
2443                              bool acquire, bool release,
2444                              bool weak,
2445                              Register result) {
2446   if (result == noreg)  result = rscratch1;
2447   BLOCK_COMMENT("cmpxchg {");
2448   if (UseLSE) {
2449     mov(result, expected);
2450     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2451     compare_eq(result, expected, size);
2452   } else {
2453     Label retry_load, done;
2454     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2455       prfm(Address(addr), PSTL1STRM);
2456     bind(retry_load);
2457     load_exclusive(result, addr, size, acquire);
2458     compare_eq(result, expected, size);
2459     br(Assembler::NE, done);
2460     store_exclusive(rscratch1, new_val, addr, size, release);
2461     if (weak) {
2462       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2463     } else {
2464       cbnzw(rscratch1, retry_load);
2465     }
2466     bind(done);
2467   }
2468   BLOCK_COMMENT("} cmpxchg");
2469 }
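
// Illustrative use (a sketch, not from a particular caller): swap in a new
// 32-bit value and branch on the EQ flag that cmpxchg leaves behind:
//   cmpxchg(addr, expected, new_val, Assembler::word,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, L_swapped);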
2470 
2471 // A generic comparison. Only compares for equality, clobbers rscratch1.
2472 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2473   if (size == xword) {
2474     cmp(rm, rn);
2475   } else if (size == word) {
2476     cmpw(rm, rn);
2477   } else if (size == halfword) {
2478     eorw(rscratch1, rm, rn);
2479     ands(zr, rscratch1, 0xffff);
2480   } else if (size == byte) {
2481     eorw(rscratch1, rm, rn);
2482     ands(zr, rscratch1, 0xff);
2483   } else {
2484     ShouldNotReachHere();
2485   }
2486 }
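
// There is no 16- or 8-bit register compare, so for the sub-word sizes
// compare_eq() above emits, e.g. for halfword (a comment sketch):
//   eorw rscratch1, rm, rn
//   ands zr, rscratch1, #0xffff   // EQ iff the low 16 bits match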
2487 
2488 
2489 static bool different(Register a, RegisterOrConstant b, Register c) {
2490   if (b.is_constant())
2491     return a != c;
2492   else
2493     return a != b.as_register() && a != c && b.as_register() != c;
2494 }
2495 
2496 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2497 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2498   if (UseLSE) {                                                         \
2499     prev = prev->is_valid() ? prev : zr;                                \
2500     if (incr.is_register()) {                                           \
2501       AOP(sz, incr.as_register(), prev, addr);                          \
2502     } else {                                                            \
2503       mov(rscratch2, incr.as_constant());                               \
2504       AOP(sz, rscratch2, prev, addr);                                   \
2505     }                                                                   \
2506     return;                                                             \
2507   }                                                                     \
2508   Register result = rscratch2;                                          \
2509   if (prev->is_valid())                                                 \
2510     result = different(prev, incr, addr) ? prev : rscratch2;            \
2511                                                                         \
2512   Label retry_load;                                                     \
2513   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2514     prfm(Address(addr), PSTL1STRM);                                     \
2515   bind(retry_load);                                                     \
2516   LDXR(result, addr);                                                   \
2517   OP(rscratch1, result, incr);                                          \
2518   STXR(rscratch2, rscratch1, addr);                                     \
2519   cbnzw(rscratch2, retry_load);                                         \
2520   if (prev->is_valid() && prev != result) {                             \
2521     IOP(prev, rscratch1, incr);                                         \
2522   }                                                                     \
2523 }
2524 
2525 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2526 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2527 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2528 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2529 
2530 #undef ATOMIC_OP
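
// For example (comment sketch), ATOMIC_OP(add, ...) above defines
// atomic_add(prev, incr, addr): with UseLSE it is a single LDADD; otherwise
// it is an LDXR/ADD/STXR retry loop, and when prev had to be diverted to
// rscratch2 the IOP (here sub) rebuilds the old value into prev from
// rscratch1 == old + incr.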
2531 
2532 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2533 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2534   if (UseLSE) {                                                         \
2535     prev = prev->is_valid() ? prev : zr;                                \
2536     AOP(sz, newv, prev, addr);                                          \
2537     return;                                                             \
2538   }                                                                     \
2539   Register result = rscratch2;                                          \
2540   if (prev->is_valid())                                                 \
2541     result = different(prev, newv, addr) ? prev : rscratch2;            \
2542                                                                         \
2543   Label retry_load;                                                     \
2544   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2545     prfm(Address(addr), PSTL1STRM);                                     \
2546   bind(retry_load);                                                     \
2547   LDXR(result, addr);                                                   \
2548   STXR(rscratch1, newv, addr);                                          \
2549   cbnzw(rscratch1, retry_load);                                         \
2550   if (prev->is_valid() && prev != result)                               \
2551     mov(prev, result);                                                  \
2552 }
2553 
2554 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2555 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2556 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2557 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2558 
2559 #undef ATOMIC_XCHG
2560 
2561 #ifndef PRODUCT
2562 extern "C" void findpc(intptr_t x);
2563 #endif
2564 
2565 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2566 {
2567   // In order to get locks to work, we need to fake an in_VM state
2568   if (ShowMessageBoxOnError) {
2569     JavaThread* thread = JavaThread::current();
2570     JavaThreadState saved_state = thread->thread_state();
2571     thread->set_thread_state(_thread_in_vm);
2572 #ifndef PRODUCT
2573     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2574       ttyLocker ttyl;
2575       BytecodeCounter::print();
2576     }
2577 #endif
2578     if (os::message_box(msg, "Execution stopped, print registers?")) {
2579       ttyLocker ttyl;
2580       tty->print_cr(" pc = 0x%016lx", pc);
2581 #ifndef PRODUCT
2582       tty->cr();
2583       findpc(pc);
2584       tty->cr();
2585 #endif
2586       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2587       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2588       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2589       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2590       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2591       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2592       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2593       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2594       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2595       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2596       tty->print_cr("r10 = 0x%016lx", regs[10]);
2597       tty->print_cr("r11 = 0x%016lx", regs[11]);
2598       tty->print_cr("r12 = 0x%016lx", regs[12]);
2599       tty->print_cr("r13 = 0x%016lx", regs[13]);
2600       tty->print_cr("r14 = 0x%016lx", regs[14]);
2601       tty->print_cr("r15 = 0x%016lx", regs[15]);
2602       tty->print_cr("r16 = 0x%016lx", regs[16]);
2603       tty->print_cr("r17 = 0x%016lx", regs[17]);
2604       tty->print_cr("r18 = 0x%016lx", regs[18]);
2605       tty->print_cr("r19 = 0x%016lx", regs[19]);
2606       tty->print_cr("r20 = 0x%016lx", regs[20]);
2607       tty->print_cr("r21 = 0x%016lx", regs[21]);
2608       tty->print_cr("r22 = 0x%016lx", regs[22]);
2609       tty->print_cr("r23 = 0x%016lx", regs[23]);
2610       tty->print_cr("r24 = 0x%016lx", regs[24]);
2611       tty->print_cr("r25 = 0x%016lx", regs[25]);
2612       tty->print_cr("r26 = 0x%016lx", regs[26]);
2613       tty->print_cr("r27 = 0x%016lx", regs[27]);
2614       tty->print_cr("r28 = 0x%016lx", regs[28]);
2615       tty->print_cr("r30 = 0x%016lx", regs[30]);
2616       tty->print_cr("r31 = 0x%016lx", regs[31]);
2617       BREAKPOINT;
2618     }
2619   }
2620   fatal("DEBUG MESSAGE: %s", msg);
2621 }

void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i >= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);         // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);         // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}
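
// Illustrative sketch (not the generated code): the four instructions
// above compute, in effect,
//
//   unsigned __int128 sum = (((unsigned __int128)dest_hi << 64) | dest_lo)
//                         + src1 + src2;
//   dest_lo       = (uint64_t)sum;
//   final_dest_hi = (uint64_t)(sum >> 64);
//
// i.e. a 128-bit accumulate of two 64-bit addends, with the carries from
// both additions propagated into the high word.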

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Is the offset reachable?
  //   Not aligned - 9-bit signed offset
  //   Aligned     - 12-bit unsigned offset, shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether the offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether the lower offset is aligned to a pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // A load/store pair instruction only supports an element-size-aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}
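
// For example (illustrative): with AvoidUnalignedAccesses set and
// base == sp, two 8-byte slots at sp+16 and sp+24 pass the check
// (16 is pair-aligned), whereas slots at sp+8 and sp+16 do not
// (8 & 15 != 0), so the latter pair is left unmerged.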

// Checks whether the current and previous loads/stores can be merged.
// Returns true if they can, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index is not supported.");

  // Only accesses with the same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // The offset must be within the ldp/stp instruction's immediate range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge the current load/store with the previous load/store into an ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
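
// Illustrative example of the rewrite performed here: two adjacent
// same-size loads off the same base, e.g.
//
//   ldr x1, [x5, #16]
//   ldr x2, [x5, #24]
//
// become, after backing the code end up over the first instruction,
//
//   ldp x1, x2, [x5, #16]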

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Infrequently executed code is moved outside the loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// Pseudocode from the Java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp rather than cbz/cbnz: the condition is used twice, so fewer branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out] crc  Register containing the crc.
 * @param [in] val      Register containing the byte to fold into the CRC.
 * @param [in] table    Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}
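
// A minimal illustrative sketch (assumption: the standard reflected
// CRC-32 polynomial 0xEDB88320, as used by java.util.zip.CRC32) of how
// such a table is produced and how the step above consumes it;
// build_table0/update_byte are hypothetical names, not HotSpot code:
//
//   static uint32_t crc_table0[256];
//   static void build_table0() {
//     for (uint32_t i = 0; i < 256; i++) {
//       uint32_t c = i;
//       for (int k = 0; k < 8; k++)
//         c = (c & 1) ? 0xEDB88320u ^ (c >> 1) : c >> 1;
//       crc_table0[i] = c;
//     }
//   }
//   static uint32_t update_byte(uint32_t crc, uint8_t b) {
//     return crc_table0[(crc ^ b) & 0xff] ^ (crc >> 8);
//   }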

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3.
 *
 * @param [in,out] crc  Register containing the crc.
 * @param [in] v        Register containing the 32-bit value to fold into the CRC.
 * @param [in] table0   Register containing table 0 of crc constants.
 * @param [in] table1   Register containing table 1 of crc constants.
 * @param [in] table2   Register containing table 2 of crc constants.
 * @param [in] table3   Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
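
// Illustrative sketch of the slicing-by-4 step emitted above (t0..t3 are
// hypothetical names for the tables set up by kernel_crc32):
//
//   static uint32_t update_word(uint32_t crc, uint32_t w,
//                               const uint32_t* t0, const uint32_t* t1,
//                               const uint32_t* t2, const uint32_t* t3) {
//     w ^= crc;
//     return t3[w & 0xff] ^ t2[(w >> 8) & 0xff]
//          ^ t1[(w >> 16) & 0xff] ^ t0[w >> 24];
//   }
//
// With upper == true the high word of a 64-bit load is folded instead,
// i.e. w is taken as (v >> 32) ^ crc.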

void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table0..table3  registers that will hold the addresses of the CRC tables
 * @param tmp, tmp2, tmp3  scratch registers
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
      return;
  }

    mvnw(crc, crc);

    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table0..table3  registers passed through as scratch registers
 * @param tmp, tmp2, tmp3  scratch registers (unused here)
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to the klass gap in the destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /*  Old algorithm: is this any worse?
    Label nonnull;
    cbnz(r, nonnull);
    sub(r, r, rheapbase);
    bind(nonnull);
    lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}
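
// Illustrative sketch (not the generated code) of the encoding above and
// the matching decode_heap_oop below, for a non-null heap base:
//
//   uint32_t encode(uintptr_t s) {               // subs + csel(HS) + lsr:
//     uintptr_t d = (s >= base) ? s - base : 0;  // NULL (and anything below
//     return (uint32_t)(d >> LogMinObjAlignmentInBytes);  // base) maps to 0
//   }
//   uintptr_t decode(uint32_t n) {               // cbz + add with shifted reg
//     return n == 0 ? 0 : base + ((uintptr_t)n << LogMinObjAlignmentInBytes);
//   }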

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (CompressedOops::base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);
}

void  MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0 || d != s) {
      lsl(d, s, CompressedOops::shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}

void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}

void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (CompressedKlassPointers::shift() != 0) {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  sub(dst, src, rbase);
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}

void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}

void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
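  // 0xDEADBEEF is only a placeholder; the oop relocation recorded above
  // allows the real narrow oop to be patched in later.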
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}
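
// Illustrative: movz writes bits 16..31 and zeroes the rest, then movk
// patches in bits 0..15 without disturbing the high half, so the pair
// materializes the full 32-bit narrow klass:
//
//   dst = ((nk >> 16) << 16) | (nk & 0xffff) == nk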

void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use the stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}
4069 
load_heap_oop(Register dst,Address src,Register tmp1,Register thread_tmp,DecoratorSet decorators)4070 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4071                                    Register thread_tmp, DecoratorSet decorators) {
4072   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4073 }
4074 
load_heap_oop_not_null(Register dst,Address src,Register tmp1,Register thread_tmp,DecoratorSet decorators)4075 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4076                                             Register thread_tmp, DecoratorSet decorators) {
4077   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4078 }
4079 
store_heap_oop(Address dst,Register src,Register tmp1,Register thread_tmp,DecoratorSet decorators)4080 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4081                                     Register thread_tmp, DecoratorSet decorators) {
4082   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4083 }
4084 
4085 // Used for storing NULLs.
store_heap_oop_null(Address dst)4086 void MacroAssembler::store_heap_oop_null(Address dst) {
4087   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4088 }
4089 
allocate_metadata_address(Metadata * obj)4090 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4091   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4092   int index = oop_recorder()->allocate_metadata_index(obj);
4093   RelocationHolder rspec = metadata_Relocation::spec(index);
4094   return Address((address)obj, rspec);
4095 }

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll)
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
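  // Duff's-device style entry: each unrolled str below is a single 4-byte
  // instruction, so branching (rscratch1 * 4) bytes back from 'entry' makes
  // exactly rscratch1 of the stores execute on the first pass.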
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
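
// Roughly what bang_stack_size() does, as a C sketch (illustrative only;
// assumes writes to unmapped guard pages fault):
//
//   char *p = (char *)sp;
//   for (long remaining = size; remaining > 0; remaining -= page_size)
//     *(p -= page_size) = 0;            // touch each page of the new frame
//   for (int i = 0; i < shadow_pages - 1; i++)
//     *(p -= page_size) = 0;            // then walk down the shadow zone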

// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
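  // The offsets are in 4K pages; adrp's signed 21-bit page immediate gives
  // +/- (1 << 20) pages, i.e. roughly +/- 4GB of reach.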
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    _adrp(reg1, dest.target());
  } else {
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

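// The small-frame cutoffs below follow the instruction encodings: stp/ldp
// take a 7-bit signed immediate scaled by the register size (at most +504
// bytes for 64-bit pairs, hence the (1 << 9) test), while sub/add accept a
// 12-bit unsigned immediate (hence the (1 << 12) test).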
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or
  // return -1.

  // For a larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8 LL / 4 UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // the load above. The alternative is to initialize it before the loop,
        // but that would affect performance on in-order systems with 2 or more
        // ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we
        // can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
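      // SWAR zero-lane test: after the eor, a lane of ch2 is zero iff it
      // matched ch1. With ones = tmp3 and highs = 0x80../0x8000.., the test
      // (x - ones) & ~x & highs (bics supplies the "& ~") flags zero lanes.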
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
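
// A minimal C sketch (illustrative only) of the SWAR test string_indexof_char
// uses above to find a 16-bit char in a 64-bit word holding four chars:
//
//   #include <stdint.h>
//   static inline int has_matching_u16(uint64_t word, uint16_t ch) {
//     uint64_t pattern = 0x0001000100010001ULL * ch; // splat ch into 4 lanes
//     uint64_t x = word ^ pattern;                   // zero lane <=> match
//     return ((x - 0x0001000100010001ULL) & ~x & 0x8000800080008000ULL) != 0;
//   }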

// Compare strings.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, but the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
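    // rscratch2 = tmp1 ^ tmp2, and the data was loaded little-endian, so rev
    // byte-reverses it and clz then counts from the first-in-memory end; andr
    // rounds the resulting bit index down to an 8- or 16-bit character
    // boundary before the differing characters are shifted out and extended.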
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if a provided byte array contains a byte with the highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // Simple and most common case of aligned small array which is not at the
    // end of memory page is placed here. All other cases are in stub.
    Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    assert_different_registers(ary1, len, result);

    cmpw(len, 0);
    br(LE, SET_RESULT);
    cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // size > 32 then go to stub
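    // Page-crossing check: shifting the address left by (64 - log2(page_size))
    // leaves only the in-page offset in the register's top bits; if adding the
    // similarly shifted 32-byte read size carries out (CS), the reads below
    // could run off the current page, so take the stub path instead.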
    int shift = 64 - exact_log2(os::vm_page_size());
    lsl(rscratch1, ary1, shift);
    mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // at the end of page then go to stub
    subs(len, len, wordSize);
    br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);

  BIND(END);
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(
            StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked" (surprisingly) by the ldrw, and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
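    // tmp5 = -(array length in bits); the lslv by this amount (taken mod 64)
    // in the tail paths shifts out the bytes of the final, overlapping 8-byte
    // load that lie beyond the logical end of the arrays.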
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      bl(zero_blocks);
    }
  }
  bind(around);
5504   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5505     Label l;
5506     tbz(cnt, exact_log2(i), l);
5507     for (int j = 0; j < i; j += 2) {
5508       stp(zr, zr, post(ptr, 16));
5509     }
5510     bind(l);
5511   }
5512   {
5513     Label l;
5514     tbz(cnt, 0, l);
5515     str(zr, Address(ptr));
5516     bind(l);
5517   }
5518   BLOCK_COMMENT("} zero_words");
5519 }
5520 
5521 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5522 // cnt:          Immediate count in HeapWords.
5523 #define SmallArraySize (18 * BytesPerLong)
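// For example (a sketch): cnt == 7 stores the odd word first, then
// takes the small-array path with three stp instructions at word
// offsets 1, 3 and 5: seven words in all, with no loop emitted.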
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficiently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
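// The alignment step is a computed branch into a table of stp
// instructions: tmp holds the byte count needed to reach the next ZVA
// boundary (a multiple of 16, since base is 16-byte aligned), so
// tmp/16 stp instructions of 4 bytes each must execute; hence the
// branch target is initial_table_end - (tmp >> 2).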
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16-byte aligned. If not, just return and let the
  // caller handle it.
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8-byte aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill the buffer with.
// base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }
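//
//  A worked example (a sketch): for cnt == 21 with base 16-byte
//  aligned, rscratch1 == (21 & 14) == 4, so the computed branch lands
//  two stp instructions before 'entry' (4 words), one full loop
//  iteration stores 16 more, and the final tbz on bit 0 stores the
//  odd word: 4 + 16 + 1 == 21.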

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
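// The vector paths below view the 16-bit chars as bytes and use
// uzp1/uzp2 to separate them: uzp1 gathers the even (low) bytes,
// i.e. the Latin-1 values, while uzp2 gathers the odd (high) bytes.
// E.g. (a sketch) chars 0x0041 0x0062 0x0100 give low bytes
// 41 62 00 and high bytes 00 00 01; a nonzero high byte means a char
// that cannot be encoded, so control falls back to the scalar tail.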
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);

    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}


// Inflate byte[] array to char[].
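// The widening itself is a zip1 with a zero vector: interleaving the
// source bytes with zeros yields little-endian 16-bit chars, e.g.
// (a sketch) bytes 41 62 become halfwords 0x0041 0x0062.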
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
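  // The last 8 source bytes are reloaded relative to the end of the
  // buffer, so for lengths that are not a multiple of 8 this overlaps
  // bytes already inflated above; the 16-byte strq simply rewrites
  // those chars with the same values.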
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
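// encode_iso_array leaves len == 0 iff every char fit in a byte, so
// the csel below returns the original length on full success and 0
// when some char could not be compressed.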
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
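// The helper preserves everything except r0, r1 and the flags, so it
// is enough to save r0..r1 plus lr (clobbered by the blr below); dst
// is excluded from the saved set because it is about to be
// overwritten anyway.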
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // would like to assert this
  // assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clear virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}

void MacroAssembler::cache_wbsync(bool is_pre) {
  // we only need a barrier post sync
  if (!is_pre) {
    membar(Assembler::AnyAny);
  }
}