1 /*
2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include <sys/types.h>
27
28 #include "precompiled.hpp"
29 #include "jvm.h"
30 #include "asm/assembler.hpp"
31 #include "asm/assembler.inline.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/barrierSetAssembler.hpp"
35 #include "gc/shared/cardTableBarrierSet.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "compiler/disassembler.hpp"
38 #include "memory/resourceArea.hpp"
39 #include "memory/universe.hpp"
40 #include "nativeInst_aarch64.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedOops.inline.hpp"
43 #include "oops/klass.inline.hpp"
44 #include "runtime/biasedLocking.hpp"
45 #include "runtime/icache.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/jniHandles.inline.hpp"
48 #include "runtime/sharedRuntime.hpp"
49 #include "runtime/thread.hpp"
50 #ifdef COMPILER1
51 #include "c1/c1_LIRAssembler.hpp"
52 #endif
53 #ifdef COMPILER2
54 #include "oops/oop.hpp"
55 #include "opto/compile.hpp"
56 #include "opto/intrinsicnode.hpp"
57 #include "opto/node.hpp"
58 #endif
59
// Debug aids: BLOCK_COMMENT annotates the generated code stream with a
// readable marker and STOP records the reason before trapping.  In PRODUCT
// builds the annotations compile away so no text is kept in the code buffer.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and leave a named marker at its position (debug builds only).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
69
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
//
// 'branch' points at the first instruction of a PC-relative sequence
// (branch, load-literal, adr/adrp pair, or movz/movk/movk constant);
// the sequence is rewritten in place so that it refers to 'target'.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Word (4-byte) offset used by the branch and load-literal forms below.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // ADRP: the offset is expressed in 4K pages.
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The immediate is scaled by the access size, so the target must
        // be aligned accordingly.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        // The adrp page delta is computed against a synthetic destination
        // whose bits 32-47 come from the branch address itself; the movk
        // supplies the real upper bits at run time.
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // Split the offset into the immlo (bits 30:29) and immhi (bits 23:5)
    // fields of the adr/adrp instruction.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits per instruction.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
168
// Patch the movz/movk sequence at insn_addr so that it materializes the
// oop constant 'o'.  Returns the number of bytes covered by the patched
// instructions (2 or 3 words).
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits). We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP: the compressed value is split across movz (high
    // half) and movk (low half).
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP: three instructions carry 16 bits each (48 bits total).
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
194
// Patch the movz/movk pair at insn_addr with the narrow klass value 'n'.
// Returns the number of bytes covered (always two instructions).
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}
207
// Decode the PC-relative instruction (or instruction sequence) whose first
// word is 'insn' located at 'insn_addr', and return the address it refers
// to.  This is the inverse of pd_patch_instruction_size.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: recombine immlo (bits 30:29) and immhi (23:5).
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      // ADRP: page-scaled offset; compute the 4K-aligned target page.
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate): the immediate is
        // scaled by the access size.
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk #imm16<<32: the movk carries bits 32-47 of the target.
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // Plain ADR is never used for relocated addresses here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling-page load: carries no target address.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
291
safepoint_poll(Label & slow_path)292 void MacroAssembler::safepoint_poll(Label& slow_path) {
293 if (SafepointMechanism::uses_thread_local_poll()) {
294 ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
295 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
296 } else {
297 unsigned long offset;
298 adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
299 ldrw(rscratch1, Address(rscratch1, offset));
300 assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
301 cbnz(rscratch1, slow_path);
302 }
303 }
304
// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar has no register+offset form, so form the address with lea first.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global polling needs no acquire; delegate to the plain poll.
    safepoint_poll(slow_path);
  }
}
326
// Clear the thread's last-Java-frame anchor after returning from C land.
// clear_fp: also zero last_Java_fp (only the interpreter needs this).
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}
340
// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
//
// last_java_sp: register holding the last Java sp; 'sp' is copied through
//               'scratch', and an invalid register defaults to esp.
// last_java_fp: optional; stored only when valid.
// last_java_pc: optional; stored only when valid.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    // sp cannot be stored directly; copy it through the scratch register.
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}
372
// Variant taking the last Java pc as a code address: materialize it with
// adr into 'scratch', store it in the frame anchor, then delegate to the
// register variant with an invalid pc register (pc already stored here).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}
386
// Variant taking the last Java pc as a Label.  If the label is already
// bound its address is used directly; otherwise the current pc is emitted
// as a placeholder and a patch entry is registered so the real address is
// filled in when the label is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}
399
far_call(Address entry,CodeBuffer * cbuf,Register tmp)400 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
401 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
402 assert(CodeCache::find_blob(entry.target()) != NULL,
403 "destination of far call not found in code cache");
404 if (far_branches()) {
405 unsigned long offset;
406 // We can use ADRP here because we know that the total size of
407 // the code cache cannot exceed 2Gb.
408 adrp(tmp, entry, offset);
409 add(tmp, tmp, offset);
410 if (cbuf) cbuf->set_insts_mark();
411 blr(tmp);
412 } else {
413 if (cbuf) cbuf->set_insts_mark();
414 bl(entry);
415 }
416 }
417
far_jump(Address entry,CodeBuffer * cbuf,Register tmp)418 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
419 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
420 assert(CodeCache::find_blob(entry.target()) != NULL,
421 "destination of far call not found in code cache");
422 if (far_branches()) {
423 unsigned long offset;
424 // We can use ADRP here because we know that the total size of
425 // the code cache cannot exceed 2Gb.
426 adrp(tmp, entry, offset);
427 add(tmp, tmp, offset);
428 if (cbuf) cbuf->set_insts_mark();
429 br(tmp);
430 } else {
431 if (cbuf) cbuf->set_insts_mark();
432 b(entry);
433 }
434 }
435
// Check whether the stack pointer has entered the reserved zone; if so,
// call into the runtime to enable the zone and throw a delayed
// StackOverflowError from the caller's context.
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  // sp below the activation boundary means the zone is not yet touched.
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
459
// Emit the biased-locking fast path.
//
// On a successful bias acquisition (or when the current thread already
// owns the bias) control transfers to 'done'.  On a failed CAS control
// goes to 'slow_case' when provided.  If the object header does not carry
// the bias pattern at all, execution falls through to 'cas_label' (bound
// at the end of this routine) where the caller's CAS-based locking takes
// over.
//
// Returns the code offset of the mark-word load, usable for implicit
// null-check bookkeeping, or -1 when swap_reg_contains_mark is true and
// no load was emitted.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  // Ignore the age bits in the comparison: a zero result means the bias
  // owner and epoch both match.
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}
620
// Emit the biased-locking unlock fast path: if the object's header still
// carries the bias pattern, unlocking is a no-op and control transfers to
// 'done'; otherwise execution falls through to the caller's normal unlock.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}
635
pass_arg0(MacroAssembler * masm,Register arg)636 static void pass_arg0(MacroAssembler* masm, Register arg) {
637 if (c_rarg0 != arg ) {
638 masm->mov(c_rarg0, arg);
639 }
640 }
641
pass_arg1(MacroAssembler * masm,Register arg)642 static void pass_arg1(MacroAssembler* masm, Register arg) {
643 if (c_rarg1 != arg ) {
644 masm->mov(c_rarg1, arg);
645 }
646 }
647
pass_arg2(MacroAssembler * masm,Register arg)648 static void pass_arg2(MacroAssembler* masm, Register arg) {
649 if (c_rarg2 != arg ) {
650 masm->mov(c_rarg2, arg);
651 }
652 }
653
pass_arg3(MacroAssembler * masm,Register arg)654 static void pass_arg3(MacroAssembler* masm, Register arg) {
655 if (c_rarg3 != arg ) {
656 masm->mov(c_rarg3, arg);
657 }
658 }
659
// Emit a call from generated code into the VM runtime.
//
// Sets up the last-Java-frame anchor, passes the current thread as the
// first C argument, performs the call, clears the anchor, then optionally
// forwards any pending exception and fetches the oop result.
//
// oop_result:          if valid, receives the VM call's oop result.
// java_thread:         thread register; invalid defaults to rthread.
// last_java_sp:        sp to record in the anchor; invalid defaults to esp.
// entry_point:         C entry to call.
// number_of_arguments: arguments already placed in c_rarg1.. by the caller.
// check_exceptions:    branch to the forward-exception stub if one is pending.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // The label marks the return address recorded in the frame anchor.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
723
// Convenience wrapper: call the VM with default thread (rthread) and
// default last Java sp (esp).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}
727
// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
//
// Emits the call instruction (and, when branches are far, a trampoline
// stub in the stub section that the call can later be redirected through).
// Returns a non-null code address on success, or NULL when the trampoline
// stub could not be emitted because the code cache is full.
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Call to the current pc; patching later redirects it through the
    // trampoline stub emitted above.
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}
767
768
769 // Emit a trampoline stub for a call to a target which is too far away.
770 //
771 // code sequences:
772 //
773 // call-site:
774 // branch-and-link to <destination> or <trampoline stub>
775 //
776 // Related trampoline stub for this call site in the stub section:
777 // load the call target from the constant pool
778 // branch (LR still points to the call site above)
779
// Emit a trampoline stub (in the stub section) for a call at
// insts_call_instruction_offset whose destination 'dest' is too far for
// a direct branch.  The stub is: ldr rscratch1, <literal dest>;
// br rscratch1; <64-bit dest>.  Returns the stub start address, or NULL
// if the code buffer could not be expanded.
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);  // PC-relative literal load of 'dest' below
  br(rscratch1);
  bind(target);
  // The 64-bit destination must sit at exactly data_offset so that
  // NativeCallTrampolineStub can find and patch it.
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
815
// Emit the to-interpreter stub for a static call.  The instruction
// layout here is a fixed contract: CompiledDirectStaticCall patches the
// metadata and branch-target fields in place, so neither the sequence
// nor its length may change.
void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  // Placeholder Method*; patched later to the resolved callee.
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  // Placeholder target; patched to the interpreter entry.
  movptr(rscratch1, 0);
  br(rscratch1);
}
827
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}
836
// Emit an inline-cache call to 'entry'.  rscratch2 is loaded with the
// non-oop sentinel as the initial cached klass; the IC machinery later
// patches it.  Returns trampoline_call's result (NULL if code cache full).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}
845
846 // Implementation of call_VM versions
847
// call_VM: call a VM runtime routine with 0 register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}
853
// call_VM with 1 register argument (moved into c_rarg1 by pass_arg1).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}
861
// call_VM with 2 register arguments.  Args are placed highest-first so
// that moving a later arg cannot clobber an earlier one still pending.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}
872
// call_VM with 3 register arguments.  Args are placed highest-first;
// the asserts verify no incoming arg lives in a c_rarg about to be
// overwritten.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}
889
// call_VM variant taking an explicit last_java_sp (recorded in the
// thread anchor across the VM call); thread register is rthread.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
897
// Explicit-last_java_sp call_VM with 1 register argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
906
// Explicit-last_java_sp call_VM with 2 register arguments
// (highest arg placed first to avoid clobbering — see asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
919
// Explicit-last_java_sp call_VM with 3 register arguments
// (highest arg placed first to avoid clobbering — see asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
935
936
// Fetch the oop result a VM call left in the thread, then clear the
// slot (it must not be left set across a safepoint) and verify the oop.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}
942
// Fetch the metadata (non-oop) result a VM call left in the thread and
// clear the slot.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}
947
align(int modulus)948 void MacroAssembler::align(int modulus) {
949 while (offset() % modulus != 0) nop();
950 }
951
952 // these are no-ops overridden by InterpreterMacroAssembler
953
// No-op here; overridden by InterpreterMacroAssembler.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
955
// No-op here; overridden by InterpreterMacroAssembler.
void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
957
958
// Materialize (*delayed_value_addr + offset).  If the delayed value is
// already known at code-generation time, return it as a constant;
// otherwise emit code to load it at runtime into tmp and return tmp.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}
974
975 // Look up the method for a megamorphic invokeinterface call.
976 // The target method is determined by <intf_klass, itable_index>.
977 // The receiver klass is in recv_klass.
978 // On success, the result will be in method_result, and execution falls through.
979 // On failure, execution transfers to the given label.
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 8 (+ vtable_base) — first itable entry.
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1 is the first iteration): the
  // first probe falls through cheaply on a hit, the steady-state loop
  // body follows at 'search'.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // Load the itable offset of this interface, then the method slot at
    // recv_klass (already biased by itable_index) + that offset.
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}
1054
1055 // virtual method calling
// virtual method calling: load Method* from recv_klass's vtable at
// vtable_index (register or compile-time constant) into method_result.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    // Dynamic index: scale it by the word size and add the fixed offset.
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    // Constant index: fold everything into a single displacement
    // (form_address handles out-of-range displacements via rscratch1).
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
1075
// Full subtype check: fast path (primary supers / cache) then slow path
// (secondary supers scan).  Branches to L_success on a positive answer;
// falls through on failure.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
1085
1086
// Fast-path subtype check: self-check, then the super_check_offset
// display probe.  Exactly one of L_success / L_failure / L_slow_path may
// be NULL, in which case it means "fall through".  temp_reg may be noreg
// only when super_check_offset is supplied as a register/constant.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // The default super_check_offset constant is -1, meaning "load it".
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Was the probe at the secondary-super cache slot?  If so we cannot
    // conclude failure yet — take the slow path.
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}
1180
1181 // These two are taken from x86, but they look generally useful
1182
1183 // scans count pointer sized words at [addr] for occurence of value,
1184 // generic
// scans count pointer sized words at [addr] for occurence of value,
// generic.  On exit the flags reflect the last compare: EQ if value was
// found, NE otherwise (callers rely on this, cf. x86 repne scas).
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  // count == 0: leave with whatever flags the caller pre-set (NE).
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1197
1198 // scans count 4 byte words at [addr] for occurence of value,
1199 // generic
// scans count 4 byte words at [addr] for occurence of value,
// generic.  Flags on exit: EQ if found, NE otherwise.
// NOTE(review): each iteration loads 4 bytes but post-increments addr by
// wordSize (8) — entries appear to be word-spaced; confirm against callers.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  // count == 0: leave with whatever flags the caller pre-set (NE).
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1212
// Slow-path subtype check: linear scan of sub_klass's secondary-supers
// array for super_klass; on a hit, cache super_klass in the secondary
// super cache.  At most one of L_success / L_failure may be NULL
// (meaning fall-through).  set_cond_codes is currently unused here.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  // Spill only the fixed scan registers that are not already our temps.
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the shared partial-subtype counter (diagnostics only).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  // (pop does not touch the flags, so the repne_scan result survives.)
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
1298
// Class-initialization barrier: branch to L_fast_path if klass is fully
// initialized or is being initialized by the current thread; otherwise
// branch to L_slow_path.  Exactly one label may be NULL, meaning
// "fall through" for that outcome.
void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, rthread, scratch);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }
  // Fast path check: class is fully initialized
  ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
  subs(zr, scratch, InstanceKlass::fully_initialized);
  br(Assembler::EQ, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
  cmp(rthread, scratch);

  // Dispatch on the thread compare; only one of the two labels can be
  // the fall-through, so exactly one branch is emitted.
  if (L_slow_path == &L_fallthrough) {
    br(Assembler::EQ, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    br(Assembler::NE, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
1328
// Emit a (VerifyOops-only) runtime check that 'reg' holds a valid oop.
// Calls the shared verify_oop stub with r0 = oop and rscratch1 = a
// descriptive message string; all clobbered registers are saved/restored.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Save r0, rscratch1, rscratch2 and lr: the stub contract is that the
  // caller's registers are preserved.
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}
1358
// Like verify_oop, but the oop lives in memory at 'addr' rather than in
// a register.  Compensates for the four words this routine pushes when
// addr is sp-relative.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    // Skip over the 4 words pushed above to reach the original slot.
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
1394
argument_address(RegisterOrConstant arg_slot,int extra_slot_offset)1395 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1396 int extra_slot_offset) {
1397 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1398 int stackElementSize = Interpreter::stackElementSize;
1399 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1400 #ifdef ASSERT
1401 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1402 assert(offset1 - offset == stackElementSize, "correct arithmetic");
1403 #endif
1404 if (arg_slot.is_constant()) {
1405 return Address(esp, arg_slot.as_constant() * stackElementSize
1406 + offset);
1407 } else {
1408 add(rscratch1, esp, arg_slot.as_register(),
1409 ext::uxtx, exact_log2(stackElementSize));
1410 return Address(rscratch1, offset);
1411 }
1412 }
1413
call_VM_leaf_base(address entry_point,int number_of_arguments,Label * retaddr)1414 void MacroAssembler::call_VM_leaf_base(address entry_point,
1415 int number_of_arguments,
1416 Label *retaddr) {
1417 Label E, L;
1418
1419 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1420
1421 mov(rscratch1, entry_point);
1422 blr(rscratch1);
1423 if (retaddr)
1424 bind(*retaddr);
1425
1426 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1427 maybe_isb();
1428 }
1429
// Leaf call with no register arguments.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1433
// Leaf call with 1 register argument (moved to c_rarg0).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}
1438
// Leaf call with 2 register arguments.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}
1444
// Leaf call with 3 register arguments.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
1452
// super_call_VM_leaf: always uses MacroAssembler's leaf-call base,
// bypassing any subclass override.  1-argument form.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1457
// super_call_VM_leaf, 2-argument form.  Args placed highest-first so a
// later pass_arg cannot clobber a pending source register.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1465
// super_call_VM_leaf, 3-argument form (highest arg placed first).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1475
// super_call_VM_leaf, 4-argument form (highest arg placed first).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1489
null_check(Register reg,int offset)1490 void MacroAssembler::null_check(Register reg, int offset) {
1491 if (needs_explicit_null_check(offset)) {
1492 // provoke OS NULL exception if reg = NULL by
1493 // accessing M[reg] w/o changing any registers
1494 // NOTE: this is plenty to provoke a segv
1495 ldr(zr, Address(reg));
1496 } else {
1497 // nothing to do, (later) access of M[reg + offset]
1498 // will provoke OS NULL exception if reg = NULL
1499 }
1500 }
1501
1502 // MacroAssembler protected routines needed to implement
1503 // public methods
1504
// Load the target of an Address (with its relocation) into r as a
// patchable 48-bit immediate sequence.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}
1510
1511 // Move a constant pointer into r. In AArch64 mode the virtual
1512 // address space is 48 bits in size, so we only need three
1513 // instructions to create a patchable instruction sequence that can
1514 // reach anywhere.
// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.  NB: relocation-patching code depends on this being
// exactly movz + movk + movk — do not shorten for small constants.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    // Emit the constant as a block comment so disassembly is readable.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}
1530
1531 // Macro to mov replicated immediate to vector register.
1532 // Vd will get the following values for different arrangements in T
1533 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh
1534 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh
1535 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh
1536 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh
1537 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh
1538 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh
1539 // T1D/T2D: invalid
// Macro to mov replicated immediate to vector register.
// Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
// Chooses between MOVI (+ORRI) and MVNI (+BICI) sequences, one
// instruction per non-trivial byte of the (possibly inverted) constant.
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of the constant and of its complement;
  // whichever needs fewer instructions wins.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  // Emit the first (initializing) instruction for the lowest non-zero byte...
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  // ...then merge each remaining non-zero byte with BICI/ORRI.
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
1576
// Materialize an arbitrary 64-bit constant in dst using the cheapest
// encoding available: a single ORR with a logical immediate when the
// bit pattern permits, otherwise the shortest MOVZ/MOVN + MOVK
// sequence, chosen by counting the all-zero (0x0000) and all-one
// (0xffff) 16-bit halfwords of the constant.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    // Annotate the disassembly with the constant being materialized.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split the constant into four 16-bit halfwords, counting how
    // many are all-zeros and how many are all-ones.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ will do: place the single non-zero halfword
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}
1689
// Materialize an arbitrary 32-bit constant in the w-form of dst:
// a single ORR with a logical immediate when possible, otherwise
// at most two of MOVZ/MOVN/MOVK chosen from the two 16-bit halves.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    // Annotate the disassembly with the constant being materialized.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      // low half is zero: MOVZ the high half into place
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      // low half is all-ones: MOVN with the inverted high half
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}
1722
1723 // Form an address from base + offset in Rd. Rd may or may
1724 // not actually be used: you must use the Address that is returned.
1725 // It is up to you to ensure that the shift provided matches the size
1726 // of your data.
// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
// Rd may be clobbered as a scratch register whenever the offset does
// not fit the scaled-immediate addressing mode directly.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets: an ADD of the high
  // part, plus a scaled immediate offset for the low part.
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way: materialize the full offset and add.
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
1757
// Atomically increment the 32-bit counter at counter_addr.
// Uses an LSE ldadd when available, otherwise an ldxrw/stxrw retry
// loop.  tmp and tmp2 are clobbered.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}
1775
1776
corrected_idivl(Register result,Register ra,Register rb,bool want_remainder,Register scratch)1777 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1778 bool want_remainder, Register scratch)
1779 {
1780 // Full implementation of Java idiv and irem. The function
1781 // returns the (pc) offset of the div instruction - may be needed
1782 // for implicit exceptions.
1783 //
1784 // constraint : ra/rb =/= scratch
1785 // normal case
1786 //
1787 // input : ra: dividend
1788 // rb: divisor
1789 //
1790 // result: either
1791 // quotient (= ra idiv rb)
1792 // remainder (= ra irem rb)
1793
1794 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1795
1796 int idivl_offset = offset();
1797 if (! want_remainder) {
1798 sdivw(result, ra, rb);
1799 } else {
1800 sdivw(scratch, ra, rb);
1801 Assembler::msubw(result, scratch, rb, ra);
1802 }
1803
1804 return idivl_offset;
1805 }
1806
corrected_idivq(Register result,Register ra,Register rb,bool want_remainder,Register scratch)1807 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1808 bool want_remainder, Register scratch)
1809 {
1810 // Full implementation of Java ldiv and lrem. The function
1811 // returns the (pc) offset of the div instruction - may be needed
1812 // for implicit exceptions.
1813 //
1814 // constraint : ra/rb =/= scratch
1815 // normal case
1816 //
1817 // input : ra: dividend
1818 // rb: divisor
1819 //
1820 // result: either
1821 // quotient (= ra idiv rb)
1822 // remainder (= ra irem rb)
1823
1824 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1825
1826 int idivq_offset = offset();
1827 if (! want_remainder) {
1828 sdiv(result, ra, rb);
1829 } else {
1830 sdiv(scratch, ra, rb);
1831 Assembler::msub(result, scratch, rb, ra);
1832 }
1833
1834 return idivq_offset;
1835 }
1836
// Emit a memory barrier with the given ordering constraint.  If the
// immediately preceding instruction was also a membar (tracked via
// the code buffer's last_insn), the two are merged in place instead
// of emitting a second dmb.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
1851
// Try to merge this load/store with the previous adjacent one into a
// single ldp/stp.  Returns true if the pair was emitted (and clears
// last_insn so a third access cannot chain onto the pair).  On
// failure, records this instruction as a merge candidate for the
// next access when its addressing mode allows it.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    // Only a naturally-aligned base_plus_offset access can start a
    // mergeable pair.
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}
1867
ldr(Register Rx,const Address & adr)1868 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1869 // We always try to merge two adjacent loads into one ldp.
1870 if (!try_merge_ldst(Rx, adr, 8, false)) {
1871 Assembler::ldr(Rx, adr);
1872 }
1873 }
1874
ldrw(Register Rw,const Address & adr)1875 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1876 // We always try to merge two adjacent loads into one ldp.
1877 if (!try_merge_ldst(Rw, adr, 4, false)) {
1878 Assembler::ldrw(Rw, adr);
1879 }
1880 }
1881
str(Register Rx,const Address & adr)1882 void MacroAssembler::str(Register Rx, const Address &adr) {
1883 // We always try to merge two adjacent stores into one stp.
1884 if (!try_merge_ldst(Rx, adr, 8, true)) {
1885 Assembler::str(Rx, adr);
1886 }
1887 }
1888
strw(Register Rw,const Address & adr)1889 void MacroAssembler::strw(Register Rw, const Address &adr) {
1890 // We always try to merge two adjacent stores into one stp.
1891 if (!try_merge_ldst(Rw, adr, 4, true)) {
1892 Assembler::strw(Rw, adr);
1893 }
1894 }
1895
1896 // MacroAssembler routines found actually to be needed
1897
push(Register src)1898 void MacroAssembler::push(Register src)
1899 {
1900 str(src, Address(pre(esp, -1 * wordSize)));
1901 }
1902
pop(Register dst)1903 void MacroAssembler::pop(Register dst)
1904 {
1905 ldr(dst, Address(post(esp, 1 * wordSize)));
1906 }
1907
1908 // Note: load_unsigned_short used to be called load_unsigned_word.
load_unsigned_short(Register dst,Address src)1909 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1910 int off = offset();
1911 ldrh(dst, src);
1912 return off;
1913 }
1914
load_unsigned_byte(Register dst,Address src)1915 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1916 int off = offset();
1917 ldrb(dst, src);
1918 return off;
1919 }
1920
load_signed_short(Register dst,Address src)1921 int MacroAssembler::load_signed_short(Register dst, Address src) {
1922 int off = offset();
1923 ldrsh(dst, src);
1924 return off;
1925 }
1926
load_signed_byte(Register dst,Address src)1927 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1928 int off = offset();
1929 ldrsb(dst, src);
1930 return off;
1931 }
1932
load_signed_short32(Register dst,Address src)1933 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1934 int off = offset();
1935 ldrshw(dst, src);
1936 return off;
1937 }
1938
load_signed_byte32(Register dst,Address src)1939 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1940 int off = offset();
1941 ldrsbw(dst, src);
1942 return off;
1943 }
1944
load_sized_value(Register dst,Address src,size_t size_in_bytes,bool is_signed,Register dst2)1945 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1946 switch (size_in_bytes) {
1947 case 8: ldr(dst, src); break;
1948 case 4: ldrw(dst, src); break;
1949 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1950 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1951 default: ShouldNotReachHere();
1952 }
1953 }
1954
store_sized_value(Address dst,Register src,size_t size_in_bytes,Register src2)1955 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1956 switch (size_in_bytes) {
1957 case 8: str(src, dst); break;
1958 case 4: strw(src, dst); break;
1959 case 2: strh(src, dst); break;
1960 case 1: strb(src, dst); break;
1961 default: ShouldNotReachHere();
1962 }
1963 }
1964
decrementw(Register reg,int value)1965 void MacroAssembler::decrementw(Register reg, int value)
1966 {
1967 if (value < 0) { incrementw(reg, -value); return; }
1968 if (value == 0) { return; }
1969 if (value < (1 << 12)) { subw(reg, reg, value); return; }
1970 /* else */ {
1971 guarantee(reg != rscratch2, "invalid dst for register decrement");
1972 movw(rscratch2, (unsigned)value);
1973 subw(reg, reg, rscratch2);
1974 }
1975 }
1976
decrement(Register reg,int value)1977 void MacroAssembler::decrement(Register reg, int value)
1978 {
1979 if (value < 0) { increment(reg, -value); return; }
1980 if (value == 0) { return; }
1981 if (value < (1 << 12)) { sub(reg, reg, value); return; }
1982 /* else */ {
1983 assert(reg != rscratch2, "invalid dst for register decrement");
1984 mov(rscratch2, (unsigned long)value);
1985 sub(reg, reg, rscratch2);
1986 }
1987 }
1988
// Decrement the 32-bit word at dst by value.  Clobbers rscratch1
// (holds the loaded word) and possibly rscratch2 (literal address).
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    // rscratch2 is taken by the address, so the value must fit a
    // 12-bit immediate.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}
2001
// Decrement the 64-bit word at dst by value.  Clobbers rscratch1
// (holds the loaded word) and possibly rscratch2 (literal address).
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    // rscratch2 is taken by the address, so the value must fit a
    // 12-bit immediate.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}
2014
incrementw(Register reg,int value)2015 void MacroAssembler::incrementw(Register reg, int value)
2016 {
2017 if (value < 0) { decrementw(reg, -value); return; }
2018 if (value == 0) { return; }
2019 if (value < (1 << 12)) { addw(reg, reg, value); return; }
2020 /* else */ {
2021 assert(reg != rscratch2, "invalid dst for register increment");
2022 movw(rscratch2, (unsigned)value);
2023 addw(reg, reg, rscratch2);
2024 }
2025 }
2026
increment(Register reg,int value)2027 void MacroAssembler::increment(Register reg, int value)
2028 {
2029 if (value < 0) { decrement(reg, -value); return; }
2030 if (value == 0) { return; }
2031 if (value < (1 << 12)) { add(reg, reg, value); return; }
2032 /* else */ {
2033 assert(reg != rscratch2, "invalid dst for register increment");
2034 movw(rscratch2, (unsigned)value);
2035 add(reg, reg, rscratch2);
2036 }
2037 }
2038
// Increment the 32-bit word at dst by value.  Clobbers rscratch1
// (holds the loaded word) and possibly rscratch2 (literal address).
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // rscratch2 is taken by the address, so the value must fit a
    // 12-bit immediate.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}
2051
// Increment the 64-bit word at dst by value.  Clobbers rscratch1
// (holds the loaded word) and possibly rscratch2 (literal address).
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // rscratch2 is taken by the address, so the value must fit a
    // 12-bit immediate.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}
2064
2065
// Push all general registers r0..r30 (bitset 0x7fffffff) onto sp.
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}
2069
// Pop all general registers r0..r30 (bitset 0x7fffffff) from sp.
void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}
2073
2074 // Push lots of registers in the bit set supplied. Don't push sp.
2075 // Return the number of words pushed
// Push lots of registers in the bit set supplied.  Don't push sp.
// Returns the number of words pushed (padded to an even count with
// zr so every store is an stp).
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs.  At most 31 registers
  // (r0..r30) plus the zr pad fit the 32-entry array exactly.
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  // First pair pre-decrements the stack by the full frame; the rest
  // store at ascending offsets within it.
  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2105
// Pop the registers named in bitset, mirroring push(bitset, stack).
// Returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;  // counts words popped; named for symmetry with push()

  // Scan bitset to accumulate register pairs (same layout as push).
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  // Inner pairs first; the final ldp post-increments the stack back
  // past the whole frame.
  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2135
2136 // Push lots of registers in the bit set supplied. Don't push sp.
2137 // Return the number of words pushed
push_fp(unsigned int bitset,Register stack)2138 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2139 int words_pushed = 0;
2140
2141 // Scan bitset to accumulate register pairs
2142 unsigned char regs[32];
2143 int count = 0;
2144 for (int reg = 0; reg <= 31; reg++) {
2145 if (1 & bitset)
2146 regs[count++] = reg;
2147 bitset >>= 1;
2148 }
2149 regs[count++] = zr->encoding_nocheck();
2150 count &= ~1; // Only push an even number of regs
2151
2152 // Always pushing full 128 bit registers.
2153 if (count) {
2154 stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2)));
2155 words_pushed += 2;
2156 }
2157 for (int i = 2; i < count; i += 2) {
2158 stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2159 words_pushed += 2;
2160 }
2161
2162 assert(words_pushed == count, "oops, pushed != count");
2163 return count;
2164 }
2165
pop_fp(unsigned int bitset,Register stack)2166 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2167 int words_pushed = 0;
2168
2169 // Scan bitset to accumulate register pairs
2170 unsigned char regs[32];
2171 int count = 0;
2172 for (int reg = 0; reg <= 31; reg++) {
2173 if (1 & bitset)
2174 regs[count++] = reg;
2175 bitset >>= 1;
2176 }
2177 regs[count++] = zr->encoding_nocheck();
2178 count &= ~1;
2179
2180 for (int i = 2; i < count; i += 2) {
2181 ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2182 words_pushed += 2;
2183 }
2184 if (count) {
2185 ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2)));
2186 words_pushed += 2;
2187 }
2188
2189 assert(words_pushed == count, "oops, pushed != count");
2190
2191 return count;
2192 }
2193
2194 #ifdef ASSERT
// Debug-only check that rheapbase still holds the compressed-oops
// base.  The body is currently compiled out (#if 0), so this is a
// no-op placeholder kept for ASSERT builds.
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
2211
resolve_jobject(Register value,Register thread,Register tmp)2212 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2213 Label done, not_weak;
2214 cbz(value, done); // Use NULL as-is.
2215
2216 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2217 tbz(r0, 0, not_weak); // Test for jweak tag.
2218
2219 // Resolve jweak.
2220 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2221 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2222 verify_oop(value);
2223 b(done);
2224
2225 bind(not_weak);
2226 // Resolve (untagged) jobject.
2227 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2228 verify_oop(value);
2229 bind(done);
2230 }
2231
// Emit code that halts the VM with a diagnostic message.  Saves all
// registers (never restored — execution ends in hlt), then calls
// debug64(msg, pc, sp) before trapping.
void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  blr(c_rarg3);
  hlt(0);
}
2242
// Emit code that prints a warning message and continues.  All
// general registers are saved around the runtime call; note that lr
// is used as the call scratch register here, which is fine because
// blr(lr) itself redefines lr with the return address.
void MacroAssembler::warn(const char* msg) {
  pusha();
  mov(c_rarg0, (address)msg);
  mov(lr, CAST_FROM_FN_PTR(address, warning));
  blr(lr);
  popa();
}
2250
unimplemented(const char * what)2251 void MacroAssembler::unimplemented(const char* what) {
2252 const char* buf = NULL;
2253 {
2254 ResourceMark rm;
2255 stringStream ss;
2256 ss.print("unimplemented: %s", what);
2257 buf = code_string(ss.as_string());
2258 }
2259 stop(buf);
2260 }
2261
2262 // If a constant does not fit in an immediate field, generate some
2263 // number of MOV instructions and then perform the operation.
// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
// insn1 is the immediate form and insn2 the register form of the
// same add/sub operation; Rd doubles as the scratch register in the
// fallback path, hence the assert_different_registers there.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Split into two 12-bit immediates: high chunk first, then low.
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Materialize the constant in Rd and use the register form.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}
2281
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
// Flag-setting variant of wrap_add_sub_imm_insn: the two-immediate
// split is not available because the intermediate step would leave
// the flags reflecting only half the operation.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    // Materialize the constant in Rd and use the register form.
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}
2296
2297
add(Register Rd,Register Rn,RegisterOrConstant increment)2298 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2299 if (increment.is_register()) {
2300 add(Rd, Rn, increment.as_register());
2301 } else {
2302 add(Rd, Rn, increment.as_constant());
2303 }
2304 }
2305
addw(Register Rd,Register Rn,RegisterOrConstant increment)2306 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2307 if (increment.is_register()) {
2308 addw(Rd, Rn, increment.as_register());
2309 } else {
2310 addw(Rd, Rn, increment.as_constant());
2311 }
2312 }
2313
sub(Register Rd,Register Rn,RegisterOrConstant decrement)2314 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2315 if (decrement.is_register()) {
2316 sub(Rd, Rn, decrement.as_register());
2317 } else {
2318 sub(Rd, Rn, decrement.as_constant());
2319 }
2320 }
2321
subw(Register Rd,Register Rn,RegisterOrConstant decrement)2322 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2323 if (decrement.is_register()) {
2324 subw(Rd, Rn, decrement.as_register());
2325 } else {
2326 subw(Rd, Rn, decrement.as_constant());
2327 }
2328 }
2329
// Reload rheapbase with the compressed-oops base.  Once the VM is
// fully initialized the base is a compile-time-known constant; before
// that it must be loaded indirectly through its address.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, CompressedOops::ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}
2341
2342 // this simulates the behaviour of the x86 cmpxchg instruction using a
2343 // load linked/store conditional pair. we use the acquire/release
2344 // versions of these instructions so that we flush pending writes as
2345 // per Java semantics.
2346
2347 // n.b the x86 version assumes the old value to be compared against is
2348 // in rax and updates rax with the value located in memory if the
2349 // cmpxchg fails. we supply a register for the old value explicitly
2350
2351 // the aarch64 load linked/store conditional instructions do not
2352 // accept an offset. so, unlike x86, we must provide a plain register
2353 // to identify the memory word to be compared/exchanged rather than a
2354 // register+offset Address.
2355
// 64-bit compare-and-exchange with branch-out success/failure
// protocol (see the block comment above about x86 cmpxchg
// emulation).  Branches to succeed on success; on failure falls
// through (or branches to *fail if given) with the observed memory
// value left in oldv.  tmp is clobbered.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2391
// Compare-and-exchange an object's mark word.  Because the mark word
// sits at offset 0, obj itself can serve as the memory address.
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}
2397
// 32-bit variant of cmpxchgptr: compare-and-exchange with branch-out
// success/failure protocol.  Branches to succeed on success; on
// failure falls through (or branches to *fail if given) with the
// observed memory value left in oldv.  tmp is clobbered.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2434
2435 // A generic CAS; success or failure is in the EQ flag. A weak CAS
2436 // doesn't retry and may fail spuriously. If the oldval is wanted,
2437 // Pass a register for the result, otherwise pass noreg.
2438
2439 // Clobbers rscratch1
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.
//
// Clobbers rscratch1.
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    // Set the EQ flag from the observed-vs-expected comparison.
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}
2470
2471 // A generic comparison. Only compares for equality, clobbers rscratch1.
compare_eq(Register rm,Register rn,enum operand_size size)2472 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2473 if (size == xword) {
2474 cmp(rm, rn);
2475 } else if (size == word) {
2476 cmpw(rm, rn);
2477 } else if (size == halfword) {
2478 eorw(rscratch1, rm, rn);
2479 ands(zr, rscratch1, 0xffff);
2480 } else if (size == byte) {
2481 eorw(rscratch1, rm, rn);
2482 ands(zr, rscratch1, 0xff);
2483 } else {
2484 ShouldNotReachHere();
2485 }
2486 }
2487
2488
different(Register a,RegisterOrConstant b,Register c)2489 static bool different(Register a, RegisterOrConstant b, Register c) {
2490 if (b.is_constant())
2491 return a != c;
2492 else
2493 return a != b.as_register() && a != c && b.as_register() != c;
2494 }
2495
// Emits atomic_<NAME>(prev, incr, addr): atomically applies OP of "incr" to
// the memory word at "addr" and returns the *old* value in "prev" (pass an
// invalid register if the old value is not wanted).
//  - With LSE, the single AOP instruction (e.g. ldadd) does everything.
//  - Otherwise a load-exclusive / store-exclusive retry loop is emitted.
//    IOP is the inverse of OP: if "prev" could not be used as the loop's
//    load target (it aliases incr or addr), the old value is recomputed
//    from the new one at the end.
// Clobbers rscratch1 and rscratch2.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

// 64/32-bit add, with and without acquire/release ("al") semantics.
ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP
2531
// Emits atomic_<OP>(prev, newv, addr): atomically stores "newv" at "addr"
// and returns the previous memory value in "prev" (pass an invalid register
// if it is not wanted).  With LSE this is a single swap (AOP); otherwise a
// load-exclusive / store-exclusive retry loop.  Clobbers rscratch1 and
// rscratch2.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

// 64/32-bit exchange, with and without acquire/release ("al") semantics.
ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG
2560
2561 #ifndef PRODUCT
2562 extern "C" void findpc(intptr_t x);
2563 #endif
2564
debug64(char * msg,int64_t pc,int64_t regs[])2565 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2566 {
2567 // In order to get locks to work, we need to fake a in_VM state
2568 if (ShowMessageBoxOnError ) {
2569 JavaThread* thread = JavaThread::current();
2570 JavaThreadState saved_state = thread->thread_state();
2571 thread->set_thread_state(_thread_in_vm);
2572 #ifndef PRODUCT
2573 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2574 ttyLocker ttyl;
2575 BytecodeCounter::print();
2576 }
2577 #endif
2578 if (os::message_box(msg, "Execution stopped, print registers?")) {
2579 ttyLocker ttyl;
2580 tty->print_cr(" pc = 0x%016lx", pc);
2581 #ifndef PRODUCT
2582 tty->cr();
2583 findpc(pc);
2584 tty->cr();
2585 #endif
2586 tty->print_cr(" r0 = 0x%016lx", regs[0]);
2587 tty->print_cr(" r1 = 0x%016lx", regs[1]);
2588 tty->print_cr(" r2 = 0x%016lx", regs[2]);
2589 tty->print_cr(" r3 = 0x%016lx", regs[3]);
2590 tty->print_cr(" r4 = 0x%016lx", regs[4]);
2591 tty->print_cr(" r5 = 0x%016lx", regs[5]);
2592 tty->print_cr(" r6 = 0x%016lx", regs[6]);
2593 tty->print_cr(" r7 = 0x%016lx", regs[7]);
2594 tty->print_cr(" r8 = 0x%016lx", regs[8]);
2595 tty->print_cr(" r9 = 0x%016lx", regs[9]);
2596 tty->print_cr("r10 = 0x%016lx", regs[10]);
2597 tty->print_cr("r11 = 0x%016lx", regs[11]);
2598 tty->print_cr("r12 = 0x%016lx", regs[12]);
2599 tty->print_cr("r13 = 0x%016lx", regs[13]);
2600 tty->print_cr("r14 = 0x%016lx", regs[14]);
2601 tty->print_cr("r15 = 0x%016lx", regs[15]);
2602 tty->print_cr("r16 = 0x%016lx", regs[16]);
2603 tty->print_cr("r17 = 0x%016lx", regs[17]);
2604 tty->print_cr("r18 = 0x%016lx", regs[18]);
2605 tty->print_cr("r19 = 0x%016lx", regs[19]);
2606 tty->print_cr("r20 = 0x%016lx", regs[20]);
2607 tty->print_cr("r21 = 0x%016lx", regs[21]);
2608 tty->print_cr("r22 = 0x%016lx", regs[22]);
2609 tty->print_cr("r23 = 0x%016lx", regs[23]);
2610 tty->print_cr("r24 = 0x%016lx", regs[24]);
2611 tty->print_cr("r25 = 0x%016lx", regs[25]);
2612 tty->print_cr("r26 = 0x%016lx", regs[26]);
2613 tty->print_cr("r27 = 0x%016lx", regs[27]);
2614 tty->print_cr("r28 = 0x%016lx", regs[28]);
2615 tty->print_cr("r30 = 0x%016lx", regs[30]);
2616 tty->print_cr("r31 = 0x%016lx", regs[31]);
2617 BREAKPOINT;
2618 }
2619 }
2620 fatal("DEBUG MESSAGE: %s", msg);
2621 }
2622
push_call_clobbered_registers()2623 void MacroAssembler::push_call_clobbered_registers() {
2624 int step = 4 * wordSize;
2625 push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2626 sub(sp, sp, step);
2627 mov(rscratch1, -step);
2628 // Push v0-v7, v16-v31.
2629 for (int i = 31; i>= 4; i -= 4) {
2630 if (i <= v7->encoding() || i >= v16->encoding())
2631 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2632 as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2633 }
2634 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2635 as_FloatRegister(3), T1D, Address(sp));
2636 }
2637
pop_call_clobbered_registers()2638 void MacroAssembler::pop_call_clobbered_registers() {
2639 for (int i = 0; i < 32; i += 4) {
2640 if (i <= v7->encoding() || i >= v16->encoding())
2641 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2642 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2643 }
2644
2645 pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2646 }
2647
push_CPU_state(bool save_vectors)2648 void MacroAssembler::push_CPU_state(bool save_vectors) {
2649 int step = (save_vectors ? 8 : 4) * wordSize;
2650 push(0x3fffffff, sp); // integer registers except lr & sp
2651 mov(rscratch1, -step);
2652 sub(sp, sp, step);
2653 for (int i = 28; i >= 4; i -= 4) {
2654 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2655 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2656 }
2657 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2658 }
2659
pop_CPU_state(bool restore_vectors)2660 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2661 int step = (restore_vectors ? 8 : 4) * wordSize;
2662 for (int i = 0; i <= 28; i += 4)
2663 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2664 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2665 pop(0x3fffffff, sp); // integer registers except lr & sp
2666 }
2667
/**
 * Helpers for multiply_to_len().
 *
 * Adds src1 and src2 into the 128-bit value dest_hi:dest_lo, propagating
 * carries, and writes the resulting high word to final_dest_hi (which may
 * equal dest_hi).
 */
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);     // fold the carry from the first add
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}
2678
2679 // Generate an address from (r + r1 extend offset). "size" is the
2680 // size of the operand. The result may be in rscratch2.
offsetted_address(Register r,Register r1,Address::extend ext,int offset,int size)2681 Address MacroAssembler::offsetted_address(Register r, Register r1,
2682 Address::extend ext, int offset, int size) {
2683 if (offset || (ext.shift() % size != 0)) {
2684 lea(rscratch2, Address(r, r1, ext));
2685 return Address(rscratch2, offset);
2686 } else {
2687 return Address(r, r1, ext);
2688 }
2689 }
2690
spill_address(int size,int offset,Register tmp)2691 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2692 {
2693 assert(offset >= 0, "spill to negative address?");
2694 // Offset reachable ?
2695 // Not aligned - 9 bits signed offset
2696 // Aligned - 12 bits unsigned offset shifted
2697 Register base = sp;
2698 if ((offset & (size-1)) && offset >= (1<<8)) {
2699 add(tmp, base, offset & ((1<<12)-1));
2700 base = tmp;
2701 offset &= -1u<<12;
2702 }
2703
2704 if (offset >= (1<<12) * size) {
2705 add(tmp, base, offset & (((1<<12)-1)<<12));
2706 base = tmp;
2707 offset &= ~(((1<<12)-1)<<12);
2708 }
2709
2710 return Address(base, offset);
2711 }
2712
2713 // Checks whether offset is aligned.
2714 // Returns true if it is, else false.
merge_alignment_check(Register base,size_t size,long cur_offset,long prev_offset) const2715 bool MacroAssembler::merge_alignment_check(Register base,
2716 size_t size,
2717 long cur_offset,
2718 long prev_offset) const {
2719 if (AvoidUnalignedAccesses) {
2720 if (base == sp) {
2721 // Checks whether low offset if aligned to pair of registers.
2722 long pair_mask = size * 2 - 1;
2723 long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2724 return (offset & pair_mask) == 0;
2725 } else { // If base is not sp, we can't guarantee the access is aligned.
2726 return false;
2727 }
2728 } else {
2729 long mask = size - 1;
2730 // Load/store pair instruction only supports element size aligned offset.
2731 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2732 }
2733 }
2734
// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The previous emitted instruction must exist and be a plain
  // immediate-offset load/store.
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  // Current access must be base+offset, and nothing may have been emitted
  // between the candidate and the current position.
  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Both accesses must have the same width and the same direction.
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp immediate range: signed 7-bit count of elements.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two offsets must be exactly adjacent (one element apart).
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  // Finally the combined access must satisfy alignment constraints.
  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}
2802
// Merge current load/store with previous load/store into ldp/stp:
// rewinds the code buffer over the previously emitted instruction and
// emits a single pair instruction covering both accesses.  Caller must
// have verified mergeability with ldst_can_merge().
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // Order the two targets by ascending memory offset; the pair instruction
  // uses the lower of the two offsets.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
2849
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * Emits the first loop of multiply_to_len: multiplies the top 64-bit word
 * of x (or 32-bit word if xlen is odd) by each 64-bit word of y, storing
 * results into z and propagating a 64-bit carry.  Words in memory are
 * big-endian int pairs, hence the ror-by-32 conversions around loads and
 * stores.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // Odd xlen: only a single 32-bit word of x remains.
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);          // odd ylen: single 32-bit word of y left
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}
2911
/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 * Emits the unrolled inner loop of multiply_to_len: multiplies product_hi
 * (a 64-bit word of x) by pairs of 64-bit words of y, accumulating into z
 * with full carry propagation, then handles the 0-3 leftover 32-bit words.
 * Array words are big-endian, hence the ror-by-32 conversions.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);      // jdx = number of 4-int (2-long) iterations

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the remaining (idx mod 4) 32-bit words.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  // One full 64-bit word remains.
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // At most one 32-bit word remains.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}
3025
/**
 * Code for BigInteger::multiplyToLen() instrinsic.
 *
 * Multiplies the big-endian int arrays x (length xlen) and y (length ylen)
 * into z (length zlen == xlen + ylen), mirroring the Java reference
 * implementation's schoolbook algorithm, but working 64 bits at a time.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);    // empty x: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  // Flush the remaining carry into z (one or two int slots).
  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z (and reserve slots for ylen/x/xstart) across the inner loop;
  // their registers are reused as temporaries inside it.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);    // odd xlen: single 32-bit word of x left

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  // Store the final carry of this row (two int halves).
  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}
3161
// Code for BigInteger::mulAdd instrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit "out" holds the final carry (or 0 when len == 0).
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
  Label LOOP, END;
  // pre-loop
  cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
  csel(out, zr, out, Assembler::EQ);   // len == 0: result (carry) is 0
  br(Assembler::EQ, END);
  // Point just past the last elements; the loop walks backwards with
  // pre-decremented addressing.
  add(in, in, len, LSL, 2); // in[j+1] address
  add(offset, out, offset, LSL, 2); // out[offset + 1] address
  mov(out, zr); // used to keep carry now
  BIND(LOOP);
  ldrw(rscratch1, Address(pre(in, -4)));
  madd(rscratch1, rscratch1, k, out);      // in[j]*k + carry
  ldrw(rscratch2, Address(pre(offset, -4)));
  add(rscratch1, rscratch1, rscratch2);    // ... + out[offset]
  strw(rscratch1, Address(offset));
  lsr(out, rscratch1, 32);                 // carry = product >>> 32
  subs(len, len, 1);
  br(Assembler::NE, LOOP);
  BIND(END);
}
3199
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 * Note: clobbers val (it ends up holding the table entry).
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}
3218
/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 * When "upper" is set, folds the upper 32 bits of v instead of the lower.
 * Clobbers tmp; overwrites crc with the folded result.
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // Select lower or upper word of v, xor'd with the incoming crc.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
3250
kernel_crc32_using_crc32(Register crc,Register buf,Register len,Register tmp0,Register tmp1,Register tmp2,Register tmp3)3251 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3252 Register len, Register tmp0, Register tmp1, Register tmp2,
3253 Register tmp3) {
3254 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3255 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3256
3257 mvnw(crc, crc);
3258
3259 subs(len, len, 128);
3260 br(Assembler::GE, CRC_by64_pre);
3261 BIND(CRC_less64);
3262 adds(len, len, 128-32);
3263 br(Assembler::GE, CRC_by32_loop);
3264 BIND(CRC_less32);
3265 adds(len, len, 32-4);
3266 br(Assembler::GE, CRC_by4_loop);
3267 adds(len, len, 4);
3268 br(Assembler::GT, CRC_by1_loop);
3269 b(L_exit);
3270
3271 BIND(CRC_by32_loop);
3272 ldp(tmp0, tmp1, Address(post(buf, 16)));
3273 subs(len, len, 32);
3274 crc32x(crc, crc, tmp0);
3275 ldr(tmp2, Address(post(buf, 8)));
3276 crc32x(crc, crc, tmp1);
3277 ldr(tmp3, Address(post(buf, 8)));
3278 crc32x(crc, crc, tmp2);
3279 crc32x(crc, crc, tmp3);
3280 br(Assembler::GE, CRC_by32_loop);
3281 cmn(len, 32);
3282 br(Assembler::NE, CRC_less32);
3283 b(L_exit);
3284
3285 BIND(CRC_by4_loop);
3286 ldrw(tmp0, Address(post(buf, 4)));
3287 subs(len, len, 4);
3288 crc32w(crc, crc, tmp0);
3289 br(Assembler::GE, CRC_by4_loop);
3290 adds(len, len, 4);
3291 br(Assembler::LE, L_exit);
3292 BIND(CRC_by1_loop);
3293 ldrb(tmp0, Address(post(buf, 1)));
3294 subs(len, len, 1);
3295 crc32b(crc, crc, tmp0);
3296 br(Assembler::GT, CRC_by1_loop);
3297 b(L_exit);
3298
3299 BIND(CRC_by64_pre);
3300 sub(buf, buf, 8);
3301 ldp(tmp0, tmp1, Address(buf, 8));
3302 crc32x(crc, crc, tmp0);
3303 ldr(tmp2, Address(buf, 24));
3304 crc32x(crc, crc, tmp1);
3305 ldr(tmp3, Address(buf, 32));
3306 crc32x(crc, crc, tmp2);
3307 ldr(tmp0, Address(buf, 40));
3308 crc32x(crc, crc, tmp3);
3309 ldr(tmp1, Address(buf, 48));
3310 crc32x(crc, crc, tmp0);
3311 ldr(tmp2, Address(buf, 56));
3312 crc32x(crc, crc, tmp1);
3313 ldr(tmp3, Address(pre(buf, 64)));
3314
3315 b(CRC_by64_loop);
3316
3317 align(CodeEntryAlignment);
3318 BIND(CRC_by64_loop);
3319 subs(len, len, 64);
3320 crc32x(crc, crc, tmp2);
3321 ldr(tmp0, Address(buf, 8));
3322 crc32x(crc, crc, tmp3);
3323 ldr(tmp1, Address(buf, 16));
3324 crc32x(crc, crc, tmp0);
3325 ldr(tmp2, Address(buf, 24));
3326 crc32x(crc, crc, tmp1);
3327 ldr(tmp3, Address(buf, 32));
3328 crc32x(crc, crc, tmp2);
3329 ldr(tmp0, Address(buf, 40));
3330 crc32x(crc, crc, tmp3);
3331 ldr(tmp1, Address(buf, 48));
3332 crc32x(crc, crc, tmp0);
3333 ldr(tmp2, Address(buf, 56));
3334 crc32x(crc, crc, tmp1);
3335 ldr(tmp3, Address(pre(buf, 64)));
3336 br(Assembler::GE, CRC_by64_loop);
3337
3338 // post-loop
3339 crc32x(crc, crc, tmp2);
3340 crc32x(crc, crc, tmp3);
3341
3342 sub(len, len, 64);
3343 add(buf, buf, 8);
3344 cmn(len, 128);
3345 br(Assembler::NE, CRC_less64);
3346 BIND(L_exit);
3347 mvnw(crc, crc);
3348 }
3349
3350 /**
3351 * @param crc register containing existing CRC (32-bit)
3352 * @param buf register pointing to input byte buffer (byte*)
3353 * @param len register containing number of bytes
3354 * @param table register that will contain address of CRC table
3355 * @param tmp scratch register
3356 */
// Compute the CRC-32 of the 'len' bytes at 'buf', updating 'crc' in place.
// Three strategies, chosen by CPU features:
//   1. UseCRC32:  delegate to the hardware crc32x/crc32w kernel.
//   2. UseNeon:   carry-less-multiply (pmull) folding over 64-byte chunks,
//                 then fall through to the table-driven tail.
//   3. otherwise: pure table-driven slicing-by-4 using table0..table3.
// Note: CRC-32 operates on the bitwise complement of the accumulator, hence
// the mvnw at entry and exit.
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
    // Hardware CRC32 instructions available: the table machinery is unneeded.
    kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
    return;
  }

  mvnw(crc, crc);  // pre-condition: work on the complemented CRC

  // Materialize the base of the 4 x 256-entry juint CRC tables; the adrp
  // result may carry a page-offset remainder that must be added back.
  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
    // Need at least 64 bytes to enter the vector fold loop.
    cmp(len, (u1)64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);  // v16 := 0

    Label L_fold;

    add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

    // Load the first 32 bytes of input and the four folding constants
    // (each replicated across both 64-bit lanes).
    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);  // inject the running CRC into lane 0

    eor(v0, T16B, v0, v16);  // mix CRC into the first data block
    sub(len, len, 64);

    BIND(L_fold);
    // Carry-less multiply-fold of v0 (low/high halves against the four
    // constants), then shuffle/shift the partial products back together.
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v28, T16B, v17, v21);  // folded result for block 0 -> v28

    // Same fold for the second 16-byte block (v1).
    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    // Prefetch the next 32 bytes of input while the multiplies complete.
    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v20, T16B, v17, v21);  // folded result for block 1 -> v20

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    // XOR folded values into the freshly loaded data; loop while >= 32 left.
    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    // Reduce the 32 bytes held in v0/v1 back to a scalar CRC, one 64-bit
    // lane at a time (two words per lane).
    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);  // undo the final subs; remaining tail is < 32 bytes
  }

  // Scalar table-driven tail: try 16-byte chunks, then 4-byte, then bytes.
  BIND(L_by16);
  subs(len, len, 16);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  BIND(L_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
  subs(len, len, 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
  subs(len, len, 1);
  ldrb(tmp, Address(post(buf, 1)));
  update_byte_crc32(crc, tmp, table0);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(L_by16_loop);
  subs(len, len, 16);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
  mvnw(crc, crc);  // post-condition: un-complement the CRC
}
3549
// CRC-32C kernel built on the hardware crc32c* instructions.
// Processes 64-byte chunks in a software-pipelined loop (each crc32cx is
// interleaved with the load feeding a later iteration), with 32-byte,
// 4-byte, and 1-byte fallback loops for the tail. 'len' is kept biased by
// the chunk size so the loop conditions are plain sign tests.
// Unlike the CRC-32 kernel, no complementing of 'crc' is done here.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  // Dispatch on remaining length: >=128 -> 64-byte pipeline, then 32/4/1.
  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128-32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);                      // len == -32 means nothing is left
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  // Prologue for the 64-byte loop: prime the pipeline by consuming the
  // first 64 bytes so the loop body can overlap loads with CRC ops.
  // buf is biased by -8 so the loop's pre-indexed load can advance it.
  BIND(CRC_by64_pre);
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32cx(crc, crc, tmp2);           // tmp2/tmp3 were loaded last iteration
  ldr(tmp0, Address(buf, 8));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop: drain the last two pipelined values.
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);

  // Undo the biases (len by 64 consumed above, buf by the -8 prologue bias)
  // and fall into the smaller loops if bytes remain.
  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}
3645
3646 /**
3647 * @param crc register containing existing CRC (32-bit)
3648 * @param buf register pointing to input byte buffer (byte*)
3649 * @param len register containing number of bytes
3650 * @param table register that will contain address of CRC table
3651 * @param tmp scratch register
3652 */
// CRC-32C entry point. Always delegates to the hardware-instruction kernel;
// the table registers are repurposed as the kernel's scratch registers and
// tmp/tmp2/tmp3 are unused.
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}
3658
3659
// RAII helper: emits code that skips everything generated between the
// constructor and destructor when the bool at *flag_addr is false.
// NOTE(review): the 'value' parameter is not consulted here — the emitted
// cbzw always skips on zero; confirm against other platforms' SkipIfEqual.
// Clobbers rscratch1.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  // adrp + byte load of the flag; offset holds the low page bits.
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}
3668
// Bind the skip target, ending the conditionally-executed region.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
3672
addptr(const Address & dst,int32_t src)3673 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3674 Address adr;
3675 switch(dst.getMode()) {
3676 case Address::base_plus_offset:
3677 // This is the expected mode, although we allow all the other
3678 // forms below.
3679 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3680 break;
3681 default:
3682 lea(rscratch2, dst);
3683 adr = Address(rscratch2);
3684 break;
3685 }
3686 ldr(rscratch1, adr);
3687 add(rscratch1, rscratch1, src);
3688 str(rscratch1, adr);
3689 }
3690
// Compare src1 against the pointer-sized value stored at address src2.
// Clobbers rscratch1.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);            // page base of src2
  ldr(rscratch1, Address(rscratch1, offset)); // load the value at src2
  cmp(src1, rscratch1);
}
3697
// Compare two oops for equality via the active GC's barrier-set assembler,
// which knows whether a plain register compare suffices for this collector.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}
3702
// Load the InstanceKlass that declares 'method' into 'holder' by chasing
// Method* -> ConstMethod* -> ConstantPool* -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}
3708
load_klass(Register dst,Register src)3709 void MacroAssembler::load_klass(Register dst, Register src) {
3710 if (UseCompressedClassPointers) {
3711 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3712 decode_klass_not_null(dst);
3713 } else {
3714 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3715 }
3716 }
3717
3718 // ((OopHandle)result).resolve();
// ((OopHandle)result).resolve();
// An OopHandle is a pointer to an oop slot in native memory; resolving it
// is a single GC-aware indirection through that slot.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}
3723
// Load the java.lang.Class mirror of the current method's holder into 'dst':
// Method* -> ConstMethod* -> ConstantPool* -> InstanceKlass* -> mirror handle,
// then resolve the OopHandle.
// NOTE(review): the 'method' parameter is ignored; the chain starts from the
// fixed rmethod register — confirm all callers have the method in rmethod.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}
3732
// Compare the klass of 'oop' against 'trial_klass' (a full Klass*), setting
// flags for a subsequent conditional branch. With compressed class pointers
// two fast paths avoid fully decoding the narrow klass word; 'tmp' is
// clobbered in every path.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      // No base: decoding is just a shift, which the comparison can absorb.
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    // General case: decode to a full Klass* before comparing.
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}
3751
// Load the prototype mark word (used by biased locking) of 'src''s klass
// into 'dst'.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}
3756
store_klass(Register dst,Register src)3757 void MacroAssembler::store_klass(Register dst, Register src) {
3758 // FIXME: Should this be a store release? concurrent gcs assumes
3759 // klass length is valid if klass field is not null.
3760 if (UseCompressedClassPointers) {
3761 encode_klass_not_null(src);
3762 strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3763 } else {
3764 str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3765 }
3766 }
3767
// Fill the 32-bit klass-gap slot of the object in 'dst' with 'src'
// (typically zr). The gap only exists when class pointers are compressed;
// otherwise this emits nothing.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}
3774
3775 // Algorithm must match CompressedOops::encode.
// Algorithm must match CompressedOops::encode.
// Compress the (possibly null) oop in 's' into narrow form in 'd'.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    // Zero-based oops: encoding is at most a right shift.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // Branch-free null handling: if s < heap base (which includes s == null),
    // the csel selects zr so null encodes to 0; otherwise shift the delta.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
       Label nonnull;
       cbnz(r, nonnull);
       sub(r, r, rheapbase);
       bind(nonnull);
       lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}
3802
// Compress the oop in 'r' in place; the caller guarantees 'r' is not null,
// so the null-preserving csel of encode_heap_oop is unnecessary.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}
3822
// Two-register variant: compress the non-null oop in 'src' into 'dst',
// leaving 'src' unmodified. 'data' tracks which register currently holds
// the partially-encoded value so a trailing mov is emitted only when
// neither transformation wrote to 'dst'.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  Register data = src;
  if (CompressedOops::base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);  // zero base and zero shift: plain copy
}
3848
// Decompress the (possibly zero) narrow oop in 's' into a full oop in 'd'.
// A zero narrow oop must decode to null, hence the cbz guard on the
// base+shift path.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    // Zero base: shift (or copy) suffices; zero stays zero automatically.
    if (CompressedOops::shift() != 0 || d != s) {
      lsl(d, s, CompressedOops::shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);  // keep null as null; skip the base add
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
3867
// Decompress the narrow oop in 'r' in place; caller guarantees non-null,
// so no null check is emitted.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // add-with-zr rather than lsl keeps the instruction count identical
      // on both paths (see the counting constraint above).
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}
3885
// Two-register variant: decompress the non-null narrow oop in 'src' into
// 'dst' without modifying 'src'.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}
3906
// Compress the Klass* in 'src' into narrow-klass form in 'dst'.
// Fast paths, in order: zero base (shift/copy), XOR-able base, 32-bit-high
// base with zero shift (truncating movw). The general path needs a scratch
// register for the base; when dst == src it borrows rheapbase and restores
// it afterwards via reinit_heapbase().
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // Base bits don't overlap klass bits, so subtraction == XOR.
    if (CompressedKlassPointers::shift() != 0) {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    // Base occupies only the high 32 bits: a 32-bit move drops it.
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  Register rbase = dst;
  if (dst == src) rbase = rheapbase;  // need a temp distinct from src
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  sub(dst, src, rbase);
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();  // we clobbered rheapbase above
}
3948
// In-place convenience overload of encode_klass_not_null(dst, src).
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}
3952
// Decompress the narrow klass in 'src' into a full Klass* in 'dst'.
// Mirrors the fast paths of encode_klass_not_null; the 32-bit-high-base
// case reinstalls the base with a single movk into bits [32,48).
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (CompressedKlassPointers::base() == NULL) {
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // Inverse of the XOR encoding: shift first, then XOR the base back in.
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    if (dst != src)
      movw(dst, src);
    // Patch the base's high 32 bits into dst without touching the low word.
    movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;  // need a temp distinct from src
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();  // we clobbered rheapbase above
}
3998
// In-place convenience overload of decode_klass_not_null(dst, src).
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
4002
// Emit a relocatable load of a narrow oop constant into 'dst'.
// The movz/movk pair carries the placeholder 0xDEADBEEF; the oop relocation
// recorded here causes the real narrow-oop value to be patched in later.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);  // placeholder high half — patched via relocation
  movk(dst, 0xBEEF);      // placeholder low half
}
4020
// Emit a relocatable load of the narrow-klass encoding of 'k' into 'dst'.
// Unlike set_narrow_oop, the real value is known now and emitted directly;
// the metadata relocation lets it be re-patched if the klass moves.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  movz(dst, (nk >> 16), 16);  // high 16 bits
  movk(dst, nk & 0xffff);     // low 16 bits
}
4034
access_load_at(BasicType type,DecoratorSet decorators,Register dst,Address src,Register tmp1,Register thread_tmp)4035 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4036 Register dst, Address src,
4037 Register tmp1, Register thread_tmp) {
4038 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4039 decorators = AccessInternal::decorator_fixup(decorators);
4040 bool as_raw = (decorators & AS_RAW) != 0;
4041 if (as_raw) {
4042 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4043 } else {
4044 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4045 }
4046 }
4047
access_store_at(BasicType type,DecoratorSet decorators,Address dst,Register src,Register tmp1,Register thread_tmp)4048 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4049 Address dst, Register src,
4050 Register tmp1, Register thread_tmp) {
4051 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4052 decorators = AccessInternal::decorator_fixup(decorators);
4053 bool as_raw = (decorators & AS_RAW) != 0;
4054 if (as_raw) {
4055 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4056 } else {
4057 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4058 }
4059 }
4060
resolve(DecoratorSet decorators,Register obj)4061 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4062 // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4063 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4064 decorators |= ACCESS_READ | ACCESS_WRITE;
4065 }
4066 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4067 return bs->resolve(this, decorators, obj);
4068 }
4069
// Load a (possibly null) heap oop from 'src' into 'dst' with IN_HEAP
// semantics plus any caller-supplied decorators.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4074
// As load_heap_oop, but the caller guarantees the loaded oop is non-null
// (IS_NOT_NULL lets the barrier code skip its null checks).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}
4079
// Store the oop in 'src' to the heap location 'dst' with IN_HEAP semantics
// plus any caller-supplied decorators (write barriers applied as needed).
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4084
// Used for storing NULLs: 'noreg' as the source tells the barrier
// assembler to store a null oop.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}
4089
// Record 'obj' in the oop recorder and return its address wrapped with a
// metadata relocation so the referencing instruction can be patched.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}
4096
4097 // Move an oop into a register. immediate is true if we want
4098 // immediate instrcutions, i.e. we are not going to patch this
4099 // instruction while the code is being executed by another thread. In
4100 // that case we can use move immediates rather than the constant pool.
movoop(Register dst,jobject obj,bool immediate)4101 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4102 int oop_index;
4103 if (obj == NULL) {
4104 oop_index = oop_recorder()->allocate_oop_index(obj);
4105 } else {
4106 #ifdef ASSERT
4107 {
4108 ThreadInVMfromUnknown tiv;
4109 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4110 }
4111 #endif
4112 oop_index = oop_recorder()->find_index(obj);
4113 }
4114 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4115 if (! immediate) {
4116 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4117 ldr_constant(dst, Address(dummy, rspec));
4118 } else
4119 mov(dst, Address((address)obj, rspec));
4120 }
4121
4122 // Move a metadata address into a register.
// Move a metadata address into a register, recording it with the oop
// recorder and a metadata relocation.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}
4133
// Return the address of 'obj' wrapped with an oop relocation, suitable for
// embedding as a patchable constant.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}
4145
4146 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// TLAB allocation: delegated to the barrier-set assembler, which emits the
// bump-pointer fast path and branches to 'slow_case' on exhaustion.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}
4156
4157 // Defines obj, preserves var_size_in_bytes
// Defines obj, preserves var_size_in_bytes
// Shared-eden allocation: delegated to the barrier-set assembler; branches
// to 'slow_case' when the fast path cannot allocate.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}
4166
4167 // Zero words; len is in bytes
4168 // Destroys all registers except addr
4169 // len must be a nonzero multiple of wordSize
// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
// Implementation is a computed-goto "Duff's device": the remainder
// (len % 8 words) is handled by jumping into the middle of the unrolled
// store block, after which full 8-word iterations run to completion.
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);          // convert byte count to word count
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll (now a multiple of 8)
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Jump into the unrolled block, skipping (unroll - remainder) stores;
  // each str is 4 bytes, hence the LSL 2 on the entry-offset computation.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
4228
// Debug-only check of TLAB invariants (start <= top <= end) for the current
// thread; emits nothing in product builds or when VerifyOops is off.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    // Preserve the two scratch registers across the checks.
    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}
4256
4257 // Writes to stack successive pages until offset reached to check for
4258 // stack overflow + shadow pages. This clobbers tmp.
// Emits a stack-bang sequence: touches (stores to) the stack one page at a
// time, walking down from the current sp for 'size' bytes plus the shadow
// zone, so a stack overflow is detected (via the guard pages) before the
// frame is actually used. 'size' is consumed (counted down) and 'tmp' is
// clobbered as the moving page pointer; rscratch1 is also clobbered.
bang_stack_size(Register size,Register tmp)4259 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4260 assert_different_registers(tmp, size, rscratch1);
4261 mov(tmp, sp);
4262 // Bang stack for total size given plus shadow page size.
4263 // Bang one page at a time because large size can bang beyond yellow and
4264 // red zones.
4265 Label loop;
4266 mov(rscratch1, os::vm_page_size());
4267 bind(loop);
// Step down one page and store; the store is what triggers the guard page.
4268 lea(tmp, Address(tmp, -os::vm_page_size()));
4269 subsw(size, size, rscratch1);
4270 str(size, Address(tmp));
// Loop while the remaining size is still positive (flags from subsw above).
4271 br(Assembler::GT, loop);
4272
4273 // Bang down shadow pages too.
4274 // At this point, (tmp-0) is the last address touched, so don't
4275 // touch it again. (It was touched as (tmp-pagesize) but then tmp
4276 // was post-decremented.) Skip this address by starting at i=1, and
4277 // touch a few more pages below. N.B. It is important to touch all
4278 // the way down to and including i=StackShadowPages.
4279 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4280 // this could be any sized move but this is can be a debugging crumb
4281 // so the bigger the better.
4282 lea(tmp, Address(tmp, -os::vm_page_size()));
4283 str(size, Address(tmp));
4284 }
4285 }
4286
4287
4288 // Move the address of the polling page into dest.
// Loads the address of the safepoint polling page into 'dest'. With
// thread-local polling the page address is read from the current thread;
// otherwise the global page address is materialized with adrp (it must be
// page aligned, so the low 12 bits / adrp byte offset must be zero).
get_polling_page(Register dest,address page,relocInfo::relocType rtype)4289 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4290 if (SafepointMechanism::uses_thread_local_poll()) {
4291 ldr(dest, Address(rthread, Thread::polling_page_offset()));
4292 } else {
4293 unsigned long off;
4294 adrp(dest, Address(page, rtype), off);
4295 assert(off == 0, "polling page must be page aligned");
4296 }
4297 }
4298
4299 // Move the address of the polling page into r, then read the polling
4300 // page.
// Convenience overload: materialize the polling page address into 'r'
// (see get_polling_page) and then perform the poll read. Returns the pc
// of the emitted load so callers can record it for safepoint bookkeeping.
read_polling_page(Register r,address page,relocInfo::relocType rtype)4301 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4302 get_polling_page(r, page, rtype);
4303 return read_polling_page(r, rtype);
4304 }
4305
4306 // Read the polling page. The address of the polling page must
4307 // already be in r.
// Emits the actual safepoint poll: a discarded 32-bit load (into zr) from
// the polling page whose address is already in 'r'. The load is tagged
// with relocation type 'rtype' so the safepoint machinery can find it.
// Returns the pc of the poll instruction.
read_polling_page(Register r,relocInfo::relocType rtype)4308 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4309 InstructionMark im(this);
4310 code_section()->relocate(inst_mark(), rtype);
// Load to zr: value is discarded; only the access (and possible fault) matters.
4311 ldrw(zr, Address(r, 0));
4312 return inst_mark();
4313 }
4314
// Materializes the page base of a literal address into 'reg1' and returns
// the within-page byte offset in 'byte_offset'. If the target page is
// within ADRP's +/-1MiB-of-pages range from *anywhere* in the code cache,
// a single adrp suffices even after relocation; otherwise a 2-instruction
// adrp+movk sequence builds the full 64-bit page address.
adrp(Register reg1,const Address & dest,unsigned long & byte_offset)4315 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4316 relocInfo::relocType rtype = dest.rspec().reloc()->type();
// Page numbers (addresses >> 12) of the code cache bounds and the target.
4317 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4318 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4319 unsigned long dest_page = (unsigned long)dest.target() >> 12;
4320 long offset_low = dest_page - low_page;
4321 long offset_high = dest_page - high_page;
4322
4323 assert(is_valid_AArch64_address(dest.target()), "bad address");
4324 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4325
4326 InstructionMark im(this);
4327 code_section()->relocate(inst_mark(), dest.rspec());
4328 // 8143067: Ensure that the adrp can reach the dest from anywhere within
4329 // the code cache so that if it is relocated we know it will still reach
// ADRP immediate range is +/- 2^20 pages (+/- 1MiB of page numbers).
4330 if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4331 _adrp(reg1, dest.target());
4332 } else {
// Out of adrp range: use adrp for the low 32 bits (keeping the current
// pc's high bits so the adrp itself is encodable), then patch the high
// 32 bits of the target in with movk.
4333 unsigned long target = (unsigned long)dest.target();
4334 unsigned long adrp_target
4335 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4336
4337 _adrp(reg1, (address)adrp_target);
4338 movk(reg1, target >> 32, 32);
4339 }
// Low 12 bits are the byte offset within the page; caller must add them.
4340 byte_offset = (unsigned long)dest.target() & 0xfff;
4341 }
4342
// Loads the card table's byte_map_base into 'reg'. byte_map_base is a
// biased pointer (card_table_base - (heap_base >> shift)), so it may not
// be a valid address at all; when it is adrp-reachable we use adrp (+ add
// for any within-page offset), otherwise we materialize the raw 64-bit
// constant.
load_byte_map_base(Register reg)4343 void MacroAssembler::load_byte_map_base(Register reg) {
4344 CardTable::CardValue* byte_map_base =
4345 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4346
4347 if (is_valid_AArch64_address((address)byte_map_base)) {
4348 // Strictly speaking the byte_map_base isn't an address at all,
4349 // and it might even be negative.
4350 unsigned long offset;
4351 adrp(reg, ExternalAddress((address)byte_map_base), offset);
4352 // We expect offset to be zero with most collectors.
4353 if (offset != 0) {
4354 add(reg, reg, offset);
4355 }
4356 } else {
// Not adrp-encodable (e.g. "negative" biased base): load as a constant.
4357 mov(reg, (uint64_t)byte_map_base);
4358 }
4359 }
4360
// Builds a stack frame of 'framesize' bytes (which includes the 2-word
// saved rfp/lr pair at the top of the frame). Three strategies depending
// on whether framesize fits the signed-9-bit / 12-bit immediate forms of
// the stp/sub encodings; rscratch1 is clobbered only in the large case.
build_frame(int framesize)4361 void MacroAssembler::build_frame(int framesize) {
4362 assert(framesize > 0, "framesize must be > 0");
4363 if (framesize < ((1 << 9) + 2 * wordSize)) {
// Small frame: drop sp once, then store rfp/lr at the top with a
// scaled-immediate stp.
4364 sub(sp, sp, framesize);
4365 stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4366 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4367 } else {
// Larger frame: push rfp/lr first (pre-indexed), then extend sp for the
// rest, using rscratch1 when the remainder exceeds the 12-bit immediate.
4368 stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4369 if (PreserveFramePointer) mov(rfp, sp);
4370 if (framesize < ((1 << 12) + 2 * wordSize))
4371 sub(sp, sp, framesize - 2 * wordSize);
4372 else {
4373 mov(rscratch1, framesize - 2 * wordSize);
4374 sub(sp, sp, rscratch1);
4375 }
4376 }
4377 }
4378
// Tears down a frame previously created by build_frame(framesize):
// restores rfp/lr and pops 'framesize' bytes, mirroring build_frame's
// small / medium / large encodings. Clobbers rscratch1 in the large case.
remove_frame(int framesize)4379 void MacroAssembler::remove_frame(int framesize) {
4380 assert(framesize > 0, "framesize must be > 0");
4381 if (framesize < ((1 << 9) + 2 * wordSize)) {
// Small frame: reload rfp/lr from the frame top, then pop in one add.
4382 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4383 add(sp, sp, framesize);
4384 } else {
// Larger frame: pop the locals first, then the rfp/lr pair (post-indexed),
// the exact inverse of build_frame's large path.
4385 if (framesize < ((1 << 12) + 2 * wordSize))
4386 add(sp, sp, framesize - 2 * wordSize);
4387 else {
4388 mov(rscratch1, framesize - 2 * wordSize);
4389 add(sp, sp, rscratch1);
4390 }
4391 ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4392 }
4393 }
4394
4395 #ifdef COMPILER2
4396 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4397
4398 // Search for str1 in str2 and return index or -1
// Intrinsic for String.indexOf: searches for the pattern (str1, cnt1) in
// the source (str2, cnt2) and leaves the match index (in characters) in
// 'result', or -1 if not found. 'icnt1' is the pattern length when known
// at compile time (-1 if unknown; 1/2/3/4 select specialized short-pattern
// paths). 'ae' encodes the argument encodings (StrIntrinsicNode::LL/UU/UL/LU,
// Latin1 vs UTF-16). Large patterns use Boyer-Moore-Horspool with a
// 256-entry skip table allocated on the stack; medium/long linear scans
// are delegated to pre-generated stub routines; small patterns use inline
// linear scans. Clobbers rscratch1/rscratch2 and all tmp registers.
string_indexof(Register str2,Register str1,Register cnt2,Register cnt1,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register tmp6,int icnt1,Register result,int ae)4399 void MacroAssembler::string_indexof(Register str2, Register str1,
4400 Register cnt2, Register cnt1,
4401 Register tmp1, Register tmp2,
4402 Register tmp3, Register tmp4,
4403 Register tmp5, Register tmp6,
4404 int icnt1, Register result, int ae) {
4405 // NOTE: tmp5, tmp6 can be zr depending on specific method version
4406 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4407
4408 Register ch1 = rscratch1;
4409 Register ch2 = rscratch2;
4410 Register cnt1tmp = tmp1;
4411 Register cnt2tmp = tmp2;
4412 Register cnt1_neg = cnt1;
4413 Register cnt2_neg = cnt2;
4414 Register result_tmp = tmp4;
4415
4416 bool isL = ae == StrIntrinsicNode::LL;
4417
// str1 is the pattern, str2 the source; 'L' = Latin1 (1 byte/char),
// otherwise UTF-16 (2 bytes/char).
4418 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4419 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4420 int str1_chr_shift = str1_isL ? 0:1;
4421 int str2_chr_shift = str2_isL ? 0:1;
4422 int str1_chr_size = str1_isL ? 1:2;
4423 int str2_chr_size = str2_isL ? 1:2;
// Per-encoding load instructions, selected once up front.
4424 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4425 (chr_insn)&MacroAssembler::ldrh;
4426 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4427 (chr_insn)&MacroAssembler::ldrh;
4428 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4429 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4430
4431 // Note, inline_string_indexOf() generates checks:
4432 // if (substr.count > string.count) return -1;
4433 // if (substr.count == 0) return 0;
4434
4435 // We have two strings, a source string in str2, cnt2 and a pattern string
4436 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4437
4438 // For larger pattern and source we use a simplified Boyer Moore algorithm.
4439 // With a small pattern and source we use linear scan.
4440
4441 if (icnt1 == -1) {
// Runtime dispatch between Boyer-Moore and linear scan based on sizes.
4442 sub(result_tmp, cnt2, cnt1);
4443 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4444 br(LT, LINEARSEARCH);
4445 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4446 subs(zr, cnt1, 256);
4447 lsr(tmp1, cnt2, 2);
4448 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4449 br(GE, LINEARSTUB);
4450 }
4451
4452 // The Boyer Moore algorithm is based on the description here:-
4453 //
4454 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4455 //
4456 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
4457 // and the 'Good Suffix' rule.
4458 //
4459 // These rules are essentially heuristics for how far we can shift the
4460 // pattern along the search string.
4461 //
4462 // The implementation here uses the 'Bad Character' rule only because of the
4463 // complexity of initialisation for the 'Good Suffix' rule.
4464 //
4465 // This is also known as the Boyer-Moore-Horspool algorithm:-
4466 //
4467 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4468 //
4469 // This particular implementation has few java-specific optimizations.
4470 //
4471 // #define ASIZE 256
4472 //
4473 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
4474 // int i, j;
4475 // unsigned c;
4476 // unsigned char bc[ASIZE];
4477 //
4478 // /* Preprocessing */
4479 // for (i = 0; i < ASIZE; ++i)
4480 // bc[i] = m;
4481 // for (i = 0; i < m - 1; ) {
4482 // c = x[i];
4483 // ++i;
4484 // // c < 256 for Latin1 string, so, no need for branch
4485 // #ifdef PATTERN_STRING_IS_LATIN1
4486 // bc[c] = m - i;
4487 // #else
4488 // if (c < ASIZE) bc[c] = m - i;
4489 // #endif
4490 // }
4491 //
4492 // /* Searching */
4493 // j = 0;
4494 // while (j <= n - m) {
4495 // c = y[i+j];
4496 // if (x[m-1] == c)
4497 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4498 // if (i < 0) return j;
4499 // // c < 256 for Latin1 string, so, no need for branch
4500 // #ifdef SOURCE_STRING_IS_LATIN1
4501 // // LL case: (c< 256) always true. Remove branch
4502 // j += bc[y[j+m-1]];
4503 // #endif
4504 // #ifndef PATTERN_STRING_IS_UTF
4505 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4506 // if (c < ASIZE)
4507 // j += bc[y[j+m-1]];
4508 // else
4509 // j += 1
4510 // #endif
4511 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4512 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4513 // if (c < ASIZE)
4514 // j += bc[y[j+m-1]];
4515 // else
4516 // j += m
4517 // #endif
4518 // }
4519 // }
4520
4521 if (icnt1 == -1) {
4522 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4523 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4524 Register cnt1end = tmp2;
4525 Register str2end = cnt2;
4526 Register skipch = tmp2;
4527
4528 // str1 length is >=8, so, we can read at least 1 register for cases when
4529 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
4530 // UL case. We'll re-read last character in inner pre-loop code to have
4531 // single outer pre-loop load
4532 const int firstStep = isL ? 7 : 3;
4533
// Initialize the 256-byte bad-character table on the stack; v0 was
// pre-filled with cnt1 (the "no match" skip distance) above.
4534 const int ASIZE = 256;
4535 const int STORED_BYTES = 32; // amount of bytes stored per instruction
4536 sub(sp, sp, ASIZE);
4537 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4538 mov(ch1, sp);
4539 BIND(BM_INIT_LOOP);
4540 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4541 subs(tmp5, tmp5, 1);
4542 br(GT, BM_INIT_LOOP);
4543
4544 sub(cnt1tmp, cnt1, 1);
4545 mov(tmp5, str2);
4546 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4547 sub(ch2, cnt1, 1);
4548 mov(tmp3, str1);
// Fill the table with per-character skip distances (m - i - 1) for the
// first m-1 pattern characters; UTF chars >= 256 can't be indexed.
4549 BIND(BCLOOP);
4550 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4551 if (!str1_isL) {
4552 subs(zr, ch1, ASIZE);
4553 br(HS, BCSKIP);
4554 }
4555 strb(ch2, Address(sp, ch1));
4556 BIND(BCSKIP);
4557 subs(ch2, ch2, 1);
4558 br(GT, BCLOOP);
4559
4560 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4561 if (str1_isL == str2_isL) {
4562 // load last 8 bytes (8LL/4UU symbols)
4563 ldr(tmp6, Address(tmp6, -wordSize));
4564 } else {
4565 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4566 // convert Latin1 to UTF. We'll have to wait until load completed, but
4567 // it's still faster than per-character loads+checks
4568 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4569 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4570 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4571 andr(tmp6, tmp6, 0xFF); // str1[N-4]
4572 orr(ch2, ch1, ch2, LSL, 16);
4573 orr(tmp6, tmp6, tmp3, LSL, 48);
4574 orr(tmp6, tmp6, ch2, LSL, 16);
4575 }
// Outer search loop: compare source char aligned with the last pattern
// char; on match, verify the rest right-to-left, else skip via the table.
4576 BIND(BMLOOPSTR2);
4577 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4578 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4579 if (str1_isL == str2_isL) {
4580 // re-init tmp3. It's for free because it's executed in parallel with
4581 // load above. Alternative is to initialize it before loop, but it'll
4582 // affect performance on in-order systems with 2 or more ld/st pipelines
4583 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4584 }
4585 if (!isL) { // UU/UL case
4586 lsl(ch2, cnt1tmp, 1); // offset in bytes
4587 }
4588 cmp(tmp3, skipch);
4589 br(NE, BMSKIP);
4590 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4591 mov(ch1, tmp6);
4592 if (isL) {
4593 b(BMLOOPSTR1_AFTER_LOAD);
4594 } else {
4595 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4596 b(BMLOOPSTR1_CMP);
4597 }
// Inner verification loop: compare remaining characters right-to-left.
4598 BIND(BMLOOPSTR1);
4599 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4600 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4601 BIND(BMLOOPSTR1_AFTER_LOAD);
4602 subs(cnt1tmp, cnt1tmp, 1);
4603 br(LT, BMLOOPSTR1_LASTCMP);
4604 BIND(BMLOOPSTR1_CMP);
4605 cmp(ch1, ch2);
4606 br(EQ, BMLOOPSTR1);
// Mismatch: advance str2 by the bad-character skip distance (or by 1 /
// cnt1 for UTF chars that don't fit the 256-entry table).
4607 BIND(BMSKIP);
4608 if (!isL) {
4609 // if we've met UTF symbol while searching Latin1 pattern, then we can
4610 // skip cnt1 symbols
4611 if (str1_isL != str2_isL) {
4612 mov(result_tmp, cnt1);
4613 } else {
4614 mov(result_tmp, 1);
4615 }
4616 subs(zr, skipch, ASIZE);
4617 br(HS, BMADV);
4618 }
4619 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4620 BIND(BMADV);
4621 sub(cnt1tmp, cnt1, 1);
4622 add(str2, str2, result_tmp, LSL, str2_chr_shift);
4623 cmp(str2, str2end);
4624 br(LE, BMLOOPSTR2);
// Pattern ran off the end of the source: no match. Pop the skip table.
4625 add(sp, sp, ASIZE);
4626 b(NOMATCH);
4627 BIND(BMLOOPSTR1_LASTCMP);
4628 cmp(ch1, ch2);
4629 br(NE, BMSKIP);
4630 BIND(BMMATCH);
// Match index = (current str2 - saved original str2), scaled to chars.
4631 sub(result, str2, tmp5);
4632 if (!str2_isL) lsr(result, result, 1);
4633 add(sp, sp, ASIZE);
4634 b(DONE);
4635
// Medium/long linear scan: delegate to a pre-generated stub per encoding.
4636 BIND(LINEARSTUB);
4637 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
4638 br(LT, LINEAR_MEDIUM);
4639 mov(result, zr);
4640 RuntimeAddress stub = NULL;
4641 if (isL) {
4642 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4643 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4644 } else if (str1_isL) {
4645 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4646 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4647 } else {
4648 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4649 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4650 }
4651 trampoline_call(stub);
4652 b(DONE);
4653 }
4654
// Inline linear scan for small patterns; negative-index loops run the
// pointers from the end of each string toward zero.
4655 BIND(LINEARSEARCH);
4656 {
4657 Label DO1, DO2, DO3;
4658
4659 Register str2tmp = tmp2;
4660 Register first = tmp3;
4661
4662 if (icnt1 == -1)
4663 {
4664 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4665
4666 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4667 br(LT, DOSHORT);
4668 BIND(LINEAR_MEDIUM);
// Scan for the first pattern character, then verify the rest forward.
4669 (this->*str1_load_1chr)(first, Address(str1));
4670 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4671 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4672 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4673 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4674
4675 BIND(FIRST_LOOP);
4676 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4677 cmp(first, ch2);
4678 br(EQ, STR1_LOOP);
4679 BIND(STR2_NEXT);
4680 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4681 br(LE, FIRST_LOOP);
4682 b(NOMATCH);
4683
4684 BIND(STR1_LOOP);
4685 adds(cnt1tmp, cnt1_neg, str1_chr_size);
4686 add(cnt2tmp, cnt2_neg, str2_chr_size);
4687 br(GE, MATCH);
4688
4689 BIND(STR1_NEXT);
4690 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4691 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4692 cmp(ch1, ch2);
4693 br(NE, STR2_NEXT);
4694 adds(cnt1tmp, cnt1tmp, str1_chr_size);
4695 add(cnt2tmp, cnt2tmp, str2_chr_size);
4696 br(LT, STR1_NEXT);
4697 b(MATCH);
4698
4699 BIND(DOSHORT);
4700 if (str1_isL == str2_isL) {
4701 cmp(cnt1, (u1)2);
4702 br(LT, DO1);
4703 br(GT, DO3);
4704 }
4705 }
4706
// Pattern length exactly 4: load it once, compare word-at-a-time.
4707 if (icnt1 == 4) {
4708 Label CH1_LOOP;
4709
4710 (this->*load_4chr)(ch1, str1);
4711 sub(result_tmp, cnt2, 4);
4712 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4713 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4714
4715 BIND(CH1_LOOP);
4716 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4717 cmp(ch1, ch2);
4718 br(EQ, MATCH);
4719 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4720 br(LE, CH1_LOOP);
4721 b(NOMATCH);
4722 }
4723
// Pattern length 2 (or runtime-short same-encoding case).
4724 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4725 Label CH1_LOOP;
4726
4727 BIND(DO2);
4728 (this->*load_2chr)(ch1, str1);
4729 if (icnt1 == 2) {
4730 sub(result_tmp, cnt2, 2);
4731 }
4732 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4733 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4734 BIND(CH1_LOOP);
4735 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4736 cmp(ch1, ch2);
4737 br(EQ, MATCH);
4738 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4739 br(LE, CH1_LOOP);
4740 b(NOMATCH);
4741 }
4742
// Pattern length 3: match first two chars as a unit, then the third.
4743 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4744 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4745
4746 BIND(DO3);
4747 (this->*load_2chr)(first, str1);
4748 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4749 if (icnt1 == 3) {
4750 sub(result_tmp, cnt2, 3);
4751 }
4752 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4753 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4754 BIND(FIRST_LOOP);
4755 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4756 cmpw(first, ch2);
4757 br(EQ, STR1_LOOP);
4758 BIND(STR2_NEXT);
4759 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4760 br(LE, FIRST_LOOP);
4761 b(NOMATCH);
4762
4763 BIND(STR1_LOOP);
4764 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4765 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4766 cmp(ch1, ch2);
4767 br(NE, STR2_NEXT);
4768 b(MATCH);
4769 }
4770
// Single-character pattern: SWAR search 8 bytes at a time when the
// source is long enough, else a simple per-character loop.
4771 if (icnt1 == -1 || icnt1 == 1) {
4772 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4773
4774 BIND(DO1);
4775 (this->*str1_load_1chr)(ch1, str1);
4776 cmp(cnt2, (u1)8);
4777 br(LT, DO1_SHORT);
4778
4779 sub(result_tmp, cnt2, 8/str2_chr_size);
4780 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4781 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4782 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4783
// Broadcast the sought character into every lane of ch1.
4784 if (str2_isL) {
4785 orr(ch1, ch1, ch1, LSL, 8);
4786 }
4787 orr(ch1, ch1, ch1, LSL, 16);
4788 orr(ch1, ch1, ch1, LSL, 32);
4789 BIND(CH1_LOOP);
4790 ldr(ch2, Address(str2, cnt2_neg));
// XOR makes matching lanes zero; the classic (x - 0x01..) & ~x & 0x80..
// zero-byte trick (via bics) then detects any zero lane.
4791 eor(ch2, ch1, ch2);
4792 sub(tmp1, ch2, tmp3);
4793 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4794 bics(tmp1, tmp1, tmp2);
4795 br(NE, HAS_ZERO);
4796 adds(cnt2_neg, cnt2_neg, 8);
4797 br(LT, CH1_LOOP);
4798
// Handle the tail by re-reading the last (possibly overlapping) 8 bytes.
4799 cmp(cnt2_neg, (u1)8);
4800 mov(cnt2_neg, 0);
4801 br(LT, CH1_LOOP);
4802 b(NOMATCH);
4803
4804 BIND(HAS_ZERO);
// rev+clz locates the lowest-addressed matching lane; >>3 converts
// the bit position to a byte offset.
4805 rev(tmp1, tmp1);
4806 clz(tmp1, tmp1);
4807 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4808 b(MATCH);
4809
4810 BIND(DO1_SHORT);
4811 mov(result_tmp, cnt2);
4812 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4813 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4814 BIND(DO1_LOOP);
4815 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4816 cmpw(ch1, ch2);
4817 br(EQ, MATCH);
4818 adds(cnt2_neg, cnt2_neg, str2_chr_size);
4819 br(LT, DO1_LOOP);
4820 }
4821 }
4822 BIND(NOMATCH);
4823 mov(result, -1);
4824 b(DONE);
4825 BIND(MATCH);
// Convert the (negative) byte offset back into a character index.
4826 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4827 BIND(DONE);
4828 }
4829
4830 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4831 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4832
// Intrinsic for String.indexOf(char) on a UTF-16 string: finds the first
// occurrence of the 16-bit character 'ch' in (str1, cnt1) and leaves its
// index in 'result', or -1 if absent. Strings of >= 4 chars are scanned
// 4 chars (8 bytes) at a time with a SWAR zero-halfword trick; shorter
// strings use a simple per-character loop. Clobbers rscratch1/rscratch2,
// tmp1-tmp3, and consumes str1/cnt1/ch.
string_indexof_char(Register str1,Register cnt1,Register ch,Register result,Register tmp1,Register tmp2,Register tmp3)4833 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4834 Register ch, Register result,
4835 Register tmp1, Register tmp2, Register tmp3)
4836 {
4837 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4838 Register cnt1_neg = cnt1;
4839 Register ch1 = rscratch1;
4840 Register result_tmp = rscratch2;
4841
// Empty string: nothing to find.
4842 cbz(cnt1, NOMATCH);
4843
4844 cmp(cnt1, (u1)4);
4845 br(LT, DO1_SHORT);
4846
// Broadcast ch into all four 16-bit lanes of the 64-bit register.
4847 orr(ch, ch, ch, LSL, 16);
4848 orr(ch, ch, ch, LSL, 32);
4849
4850 sub(cnt1, cnt1, 4);
4851 mov(result_tmp, cnt1);
4852 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
// Negative byte index walks from -len*2 up to 0 (end of string).
4853 sub(cnt1_neg, zr, cnt1, LSL, 1);
4854
4855 mov(tmp3, 0x0001000100010001);
4856
4857 BIND(CH1_LOOP);
4858 ldr(ch1, Address(str1, cnt1_neg));
// XOR zeroes matching lanes; (x - 0x0001..) & ~x & 0x8000.. (via orr/bics)
// is the standard zero-halfword detector.
4859 eor(ch1, ch, ch1);
4860 sub(tmp1, ch1, tmp3);
4861 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4862 bics(tmp1, tmp1, tmp2);
4863 br(NE, HAS_ZERO);
4864 adds(cnt1_neg, cnt1_neg, 8);
4865 br(LT, CH1_LOOP);
4866
// Tail: re-scan the final (possibly overlapping) 8 bytes once.
4867 cmp(cnt1_neg, (u1)8);
4868 mov(cnt1_neg, 0);
4869 br(LT, CH1_LOOP);
4870 b(NOMATCH);
4871
4872 BIND(HAS_ZERO);
// rev+clz finds the lowest-addressed matching lane; >>3 gives its byte offset.
4873 rev(tmp1, tmp1);
4874 clz(tmp1, tmp1);
4875 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4876 b(MATCH);
4877
4878 BIND(DO1_SHORT);
4879 mov(result_tmp, cnt1);
4880 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4881 sub(cnt1_neg, zr, cnt1, LSL, 1);
4882 BIND(DO1_LOOP);
4883 ldrh(ch1, Address(str1, cnt1_neg));
4884 cmpw(ch, ch1);
4885 br(EQ, MATCH);
4886 adds(cnt1_neg, cnt1_neg, 2);
4887 br(LT, DO1_LOOP);
4888 BIND(NOMATCH);
4889 mov(result, -1);
4890 b(DONE);
4891 BIND(MATCH);
// Convert the negative byte offset back to a character index.
4892 add(result, result_tmp, cnt1_neg, ASR, 1);
4893 BIND(DONE);
4894 }
4895
4896 // Compare strings.
// Intrinsic for String.compareTo: lexicographically compares (str1, cnt1)
// with (str2, cnt2) and leaves a negative/zero/positive value in 'result'.
// 'ae' selects the encoding combination (LL/UU/LU/UL; L = Latin1,
// U = UTF-16). Long strings go to pre-generated stubs; medium strings are
// compared a longword at a time (Latin1 widened to UTF-16 via vector zip
// when encodings differ); very short strings use a per-character loop.
// Clobbers rscratch1/rscratch2, tmp1/tmp2, vtmp1-vtmp3, cnt1/cnt2.
string_compare(Register str1,Register str2,Register cnt1,Register cnt2,Register result,Register tmp1,Register tmp2,FloatRegister vtmp1,FloatRegister vtmp2,FloatRegister vtmp3,int ae)4897 void MacroAssembler::string_compare(Register str1, Register str2,
4898 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4899 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4900 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4901 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4902 SHORT_LOOP_START, TAIL_CHECK;
4903
4904 const u1 STUB_THRESHOLD = 64 + 8;
4905 bool isLL = ae == StrIntrinsicNode::LL;
4906 bool isLU = ae == StrIntrinsicNode::LU;
4907 bool isUL = ae == StrIntrinsicNode::UL;
4908
4909 bool str1_isL = isLL || isLU;
4910 bool str2_isL = isLL || isUL;
4911
4912 int str1_chr_shift = str1_isL ? 0 : 1;
4913 int str2_chr_shift = str2_isL ? 0 : 1;
4914 int str1_chr_size = str1_isL ? 1 : 2;
4915 int str2_chr_size = str2_isL ? 1 : 2;
4916 int minCharsInWord = isLL ? wordSize : wordSize/2;
4917
4918 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
// Per-encoding single-character loads and the matching zero-extension
// used to isolate the first differing character at the end.
4919 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4920 (chr_insn)&MacroAssembler::ldrh;
4921 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4922 (chr_insn)&MacroAssembler::ldrh;
4923 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4924 (uxt_insn)&MacroAssembler::uxthw;
4925
4926 BLOCK_COMMENT("string_compare {");
4927
4928 // Bizarrely, the counts are passed in bytes, regardless of whether they
4929 // are L or U strings, however the result is always in characters.
4930 if (!str1_isL) asrw(cnt1, cnt1, 1);
4931 if (!str2_isL) asrw(cnt2, cnt2, 1);
4932
4933 // Compute the minimum of the string lengths and save the difference.
// If all min characters are equal, this length difference is the result.
4934 subsw(result, cnt1, cnt2);
4935 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4936
4937 // A very short string
4938 cmpw(cnt2, minCharsInWord);
4939 br(Assembler::LE, SHORT_STRING);
4940
4941 // Compare longwords
4942 // load first parts of strings and finish initialization while loading
4943 {
4944 if (str1_isL == str2_isL) { // LL or UU
4945 ldr(tmp1, Address(str1));
// Identical array: the length difference already in 'result' is the answer.
4946 cmp(str1, str2);
4947 br(Assembler::EQ, DONE);
4948 ldr(tmp2, Address(str2));
4949 cmp(cnt2, STUB_THRESHOLD);
4950 br(GE, STUB);
4951 subsw(cnt2, cnt2, minCharsInWord);
4952 br(EQ, TAIL_CHECK);
// Point both strings at their last longword; negative indices count up to 0.
4953 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4954 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4955 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4956 } else if (isLU) {
// Latin1 side is loaded 4 bytes at a time and widened to UTF-16 with
// zip1 against a zero vector, then moved back to a GP register.
4957 ldrs(vtmp, Address(str1));
4958 cmp(str1, str2);
4959 br(Assembler::EQ, DONE);
4960 ldr(tmp2, Address(str2));
4961 cmp(cnt2, STUB_THRESHOLD);
4962 br(GE, STUB);
4963 subw(cnt2, cnt2, 4);
4964 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4965 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4966 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4967 zip1(vtmp, T8B, vtmp, vtmpZ);
4968 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4969 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4970 add(cnt1, cnt1, 4);
4971 fmovd(tmp1, vtmp);
4972 } else { // UL case
4973 ldr(tmp1, Address(str1));
4974 cmp(str1, str2);
4975 br(Assembler::EQ, DONE);
4976 ldrs(vtmp, Address(str2));
4977 cmp(cnt2, STUB_THRESHOLD);
4978 br(GE, STUB);
4979 subw(cnt2, cnt2, 4);
4980 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4981 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4982 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4983 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4984 zip1(vtmp, T8B, vtmp, vtmpZ);
4985 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4986 add(cnt1, cnt1, 8);
4987 fmovd(tmp2, vtmp);
4988 }
4989 adds(cnt2, cnt2, isUL ? 4 : 8);
4990 br(GE, TAIL);
4991 eor(rscratch2, tmp1, tmp2);
4992 cbnz(rscratch2, DIFFERENCE);
4993 // main loop
4994 bind(NEXT_WORD);
4995 if (str1_isL == str2_isL) {
4996 ldr(tmp1, Address(str1, cnt2));
4997 ldr(tmp2, Address(str2, cnt2));
4998 adds(cnt2, cnt2, 8);
4999 } else if (isLU) {
5000 ldrs(vtmp, Address(str1, cnt1));
5001 ldr(tmp2, Address(str2, cnt2));
5002 add(cnt1, cnt1, 4);
5003 zip1(vtmp, T8B, vtmp, vtmpZ);
5004 fmovd(tmp1, vtmp);
5005 adds(cnt2, cnt2, 8);
5006 } else { // UL
5007 ldrs(vtmp, Address(str2, cnt2));
5008 ldr(tmp1, Address(str1, cnt1));
5009 zip1(vtmp, T8B, vtmp, vtmpZ);
5010 add(cnt1, cnt1, 8);
5011 fmovd(tmp2, vtmp);
5012 adds(cnt2, cnt2, 4);
5013 }
5014 br(GE, TAIL);
5015
5016 eor(rscratch2, tmp1, tmp2);
5017 cbz(rscratch2, NEXT_WORD);
5018 b(DIFFERENCE);
5019 bind(TAIL);
5020 eor(rscratch2, tmp1, tmp2);
5021 cbnz(rscratch2, DIFFERENCE);
5022 // Last longword. In the case where length == 4 we compare the
5023 // same longword twice, but that's still faster than another
5024 // conditional branch.
5025 if (str1_isL == str2_isL) {
5026 ldr(tmp1, Address(str1));
5027 ldr(tmp2, Address(str2));
5028 } else if (isLU) {
5029 ldrs(vtmp, Address(str1));
5030 ldr(tmp2, Address(str2));
5031 zip1(vtmp, T8B, vtmp, vtmpZ);
5032 fmovd(tmp1, vtmp);
5033 } else { // UL
5034 ldrs(vtmp, Address(str2));
5035 ldr(tmp1, Address(str1));
5036 zip1(vtmp, T8B, vtmp, vtmpZ);
5037 fmovd(tmp2, vtmp);
5038 }
5039 bind(TAIL_CHECK);
5040 eor(rscratch2, tmp1, tmp2);
5041 cbz(rscratch2, DONE);
5042
5043 // Find the first different characters in the longwords and
5044 // compute their difference.
5045 bind(DIFFERENCE);
// rev+clz gives the bit position of the first differing byte in memory
// order; round down to a character boundary, shift both words so that
// character is in the low bits, zero-extend, and subtract.
5046 rev(rscratch2, rscratch2);
5047 clz(rscratch2, rscratch2);
5048 andr(rscratch2, rscratch2, isLL ? -8 : -16);
5049 lsrv(tmp1, tmp1, rscratch2);
5050 (this->*ext_chr)(tmp1, tmp1);
5051 lsrv(tmp2, tmp2, rscratch2);
5052 (this->*ext_chr)(tmp2, tmp2);
5053 subw(result, tmp1, tmp2);
5054 b(DONE);
5055 }
5056
// Long strings: tail-call the pre-generated per-encoding stub.
5057 bind(STUB);
5058 RuntimeAddress stub = NULL;
5059 switch(ae) {
5060 case StrIntrinsicNode::LL:
5061 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5062 break;
5063 case StrIntrinsicNode::UU:
5064 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5065 break;
5066 case StrIntrinsicNode::LU:
5067 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5068 break;
5069 case StrIntrinsicNode::UL:
5070 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5071 break;
5072 default:
5073 ShouldNotReachHere();
5074 }
5075 assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5076 trampoline_call(stub);
5077 b(DONE);
5078
5079 bind(SHORT_STRING);
5080 // Is the minimum length zero?
5081 cbz(cnt2, DONE);
5082 // arrange code to do most branches while loading and loading next characters
5083 // while comparing previous
// Software-pipelined per-character loop: each iteration loads the next
// pair (into tmp2/rscratch1) while comparing the previous pair
// (tmp1/cnt1), alternating the two register pairs.
5084 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5085 subs(cnt2, cnt2, 1);
5086 br(EQ, SHORT_LAST_INIT);
5087 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5088 b(SHORT_LOOP_START);
5089 bind(SHORT_LOOP);
5090 subs(cnt2, cnt2, 1);
5091 br(EQ, SHORT_LAST);
5092 bind(SHORT_LOOP_START);
5093 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5094 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5095 cmp(tmp1, cnt1);
5096 br(NE, SHORT_LOOP_TAIL);
5097 subs(cnt2, cnt2, 1);
5098 br(EQ, SHORT_LAST2);
5099 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5100 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5101 cmp(tmp2, rscratch1);
5102 br(EQ, SHORT_LOOP);
5103 sub(result, tmp2, rscratch1);
5104 b(DONE);
5105 bind(SHORT_LOOP_TAIL);
5106 sub(result, tmp1, cnt1);
5107 b(DONE);
5108 bind(SHORT_LAST2);
5109 cmp(tmp2, rscratch1);
5110 br(EQ, DONE);
5111 sub(result, tmp2, rscratch1);
5112
5113 b(DONE);
5114 bind(SHORT_LAST_INIT);
5115 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5116 bind(SHORT_LAST);
5117 cmp(tmp1, cnt1);
5118 br(EQ, DONE);
5119 sub(result, tmp1, cnt1);
5120
5121 bind(DONE);
5122
5123 BLOCK_COMMENT("} string_compare");
5124 }
5125 #endif // COMPILER2
5126
5127 // This method checks if provided byte array contains byte with highest bit set.
// This method checks if provided byte array contains byte with highest bit
// set: leaves a non-zero value in 'result' if any byte of (ary1, len) has
// bit 7 set (i.e. the array is not pure ASCII/Latin1-7bit), zero otherwise.
// Small aligned arrays away from a page boundary are handled inline, a
// word at a time; everything else (long arrays, or arrays whose inline
// scan could over-read across a page boundary) goes to stubs.
// Clobbers rscratch1/rscratch2 and consumes len.
has_negatives(Register ary1,Register len,Register result)5128 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5129 // Simple and most common case of aligned small array which is not at the
5130 // end of memory page is placed here. All other cases are in stub.
5131 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
// One bit set per byte lane: the sign bit of each byte.
5132 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5133 assert_different_registers(ary1, len, result);
5134
// len <= 0: fall through to SET_RESULT with flags clear (result = 0).
5135 cmpw(len, 0);
5136 br(LE, SET_RESULT);
5137 cmpw(len, 4 * wordSize);
5138 br(GE, STUB_LONG); // size > 32 then go to stub
5139
// Page-boundary check: shifting the address left by (64 - log2(pagesize))
// isolates the within-page offset in the top bits; if offset + 32 bytes
// carries out (CS), the inline word loop could read past the page.
5140 int shift = 64 - exact_log2(os::vm_page_size());
5141 lsl(rscratch1, ary1, shift);
5142 mov(rscratch2, (size_t)(4 * wordSize) << shift);
5143 adds(rscratch2, rscratch1, rscratch2); // At end of page?
5144 br(CS, STUB); // at the end of page then go to stub
5145 subs(len, len, wordSize);
5146 br(LT, END);
5147
// Whole-word loop: test 8 bytes at a time against the sign-bit mask.
5148 BIND(LOOP);
5149 ldr(rscratch1, Address(post(ary1, wordSize)));
5150 tst(rscratch1, UPPER_BIT_MASK);
5151 br(NE, SET_RESULT);
5152 subs(len, len, wordSize);
5153 br(GE, LOOP);
5154 cmpw(len, -wordSize);
5155 br(EQ, SET_RESULT);
5156
5157 BIND(END);
// Partial tail word: shift out the bytes beyond 'len' before testing.
5158 ldr(result, Address(ary1));
5159 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5160 lslv(result, result, len);
5161 tst(result, UPPER_BIT_MASK);
5162 b(SET_RESULT);
5163
5164 BIND(STUB);
5165 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
5166 assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5167 trampoline_call(has_neg);
5168 b(DONE);
5169
5170 BIND(STUB_LONG);
5171 RuntimeAddress has_neg_long = RuntimeAddress(
5172 StubRoutines::aarch64::has_negatives_long());
5173 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5174 trampoline_call(has_neg_long);
5175 b(DONE);
5176
5177 BIND(SET_RESULT);
// Materialize the NE condition (a sign bit was found) as 0/1.
5178 cset(result, NE); // set true or false
5179
5180 BIND(DONE);
5181 }
5182
// Compare the contents of two (byte or char) array oops for equality.
//
// a1, a2:      the two array oops to compare (clobbered)
// tmp3-tmp5:   scratch registers (clobbered)
// result:      receives 1 if the arrays are equal, 0 otherwise
// cnt1:        scratch register for element counts (clobbered)
// elem_size:   element size in bytes: 1 (byte[]) or 2 (char[])
//
// Also clobbers rscratch1/rscratch2.  Arrays at or above
// stubBytesThreshold are handed to the large_array_equals stub
// (non-UseSimpleArrayEquals path only).
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    // Tag the disassembly with the element kind (U = UTF-16 char, L = Latin-1 byte).
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);     // lengths differ -> result still false
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);  // element count -> (negative) byte offset
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Binary-decomposition tail: test length bits from high to low and
    // compare a 4-byte, then 2-byte, then (byte arrays only) 1-byte chunk.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    // tmp5 = negated bit length of the data; used by lslv at TAIL/LAST_CHECK
    // to shift out bytes beyond the logical end of the final word.
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // tmp3/tmp4 and tmp1/tmp2 hold the last two word pairs; mask the
    // trailing word (shift amount tmp5 is taken mod 64 by lslv) and fold.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    // Large arrays: check lengths and first words here, then call the stub.
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // NOTE(review): no branch to EARLY_OUT is visible in this function;
    // this label appears unreachable here — confirm against history/callers.
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);       // both lengths zero -> equal
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    // Compare the final word with the out-of-range bytes shifted away.
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}
5379
5380 // Compare Strings
5381
5382 // For Strings we're passed the address of the first characters in a1
5383 // and a2 and the length in cnt1.
5384 // elem_size is the element size in bytes: either 1 or 2.
5385 // There are two implementations. For arrays >= 8 bytes, all
5386 // comparisons (including the final one, which may overlap) are
5387 // performed 8 bytes at a time. For strings < 8 bytes, we compare a
5388 // halfword, then a short, and then a byte.
5389
// a1, a2:    addresses of the first characters (clobbered)
// result:    receives 1 if the ranges are equal, 0 otherwise
// cnt1:      length in BYTES (clobbered)
// elem_size: element size in bytes: 1 (Latin-1) or 2 (UTF-16)
// Also clobbers rscratch1/rscratch2.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    // Tag the disassembly with the element kind (U = UTF-16, L = Latin-1).
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);  // assume "not equal" until proven otherwise

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  // (cnt1 is now a negative byte offset back from the current pointers.)
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Binary-decomposition tail: compare a 4-byte, a 2-byte, and finally a
  // 1-byte chunk, guided by the low bits of the byte count.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}
5470
5471
// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
// Expressed in HeapWords; zero_words() compares its word count against it.
const int MacroAssembler::zero_words_block_size = 8;
5475
5476 // zero_words() is used by C2 ClearArray patterns. It is as small as
5477 // possible, handling small word counts locally and delegating
5478 // anything larger to the zero_blocks stub. It is expanded many times
5479 // in compiled code, so it is important to keep it short.
5480
5481 // ptr: Address of a buffer to be zeroed.
5482 // cnt: Count in HeapWords.
5483 //
5484 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
void MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  // The zero_blocks stub expects its arguments in these exact registers.
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);  // fewer than zero_words_block_size words: zero inline below
  {
    // Delegate whole blocks to the zero_blocks stub; the tail code below
    // then handles whatever residual count remains in cnt.
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      trampoline_call(zero_blocks);
    } else {
      // Stub generation still in progress: call directly (presumably close
      // enough for a plain bl — TODO confirm).
      bl(zero_blocks);
    }
  }
  bind(around);
  // Binary-decomposition tail: for each power of two below the block size,
  // test the corresponding bit of cnt and emit enough paired stores.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    // Odd final word, if any.
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
}
5520
// base: Address of a buffer to be zeroed, 8 bytes aligned.
// cnt: Immediate (compile-time constant) count in HeapWords.
// Clobbers rscratch1/rscratch2 when cnt exceeds SmallArraySize words.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small buffer: fully unrolled sequence of paired stores.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel the remainder so the loop below runs a whole number of times.
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // Zero 2 * unroll words per iteration; the final pre-indexed stp
    // advances loop_base for the next pass.
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}
5555
5556 // Zero blocks of memory by using DC ZVA.
5557 //
5558 // Aligns the base address first sufficently for DC ZVA, then uses
5559 // DC ZVA repeatedly for every full block. cnt is the size to be
5560 // zeroed in HeapWords. Returns the count of words left to be zeroed
5561 // in cnt.
5562 //
5563 // NOTE: This is intended to be used in the zero_blocks() stub. If
5564 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();  // DC ZVA block size in bytes
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);  // cnt -= tmp / 8 (bytes -> HeapWords)
  // Computed branch into the stp table below: each 4-byte stp instruction
  // zeroes 16 bytes, so we skip (tmp / 16) instructions, i.e. tmp / 4 code
  // bytes, counting back from the end of the table.
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // Store table zeroing the alignment prefix at negative offsets from the
  // (now ZVA-aligned) base.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);  // zero one whole ZVA block
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}
5599
5600 // base: Address of a buffer to be filled, 8 bytes aligned.
5601 // cnt: Count in 8-byte unit.
5602 // value: Value to be filled with.
5603 // base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  // Algorithm:
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = v;
  //      case 7:
  //        p[-7] = v;
  //      case 6:
  //        p[-6] = v;
  //        // ...
  //      case 1:
  //        p[-1] = v;
  //      case 0:
  //        p += 8;
  //     } while (cnt);
  //  }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte aligned (bit 3 set), emit a single str first so
  // the paired stores below operate on a 16-byte-aligned pointer.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = even number of words handled by the partial entry into the
  // unrolled loop (Duff's-device style); each stp covers 2 words.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Each stp is 4 code bytes and stores 2 words, so jump back
  // (rscratch1 / 2) * 4 = rscratch1 * 2 bytes before 'entry'.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Store any final odd word.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
5657
5658 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5659 // java/lang/StringUTF16.compress.
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
//
// src:    address of the source 16-bit char data (clobbered)
// dst:    address of the destination byte buffer (clobbered)
// len:    number of chars to encode; on exit holds the number NOT encoded
// result: number of chars successfully encoded (== original len when every
//         char fit in one byte)
// Vtmp1-Vtmp4: SIMD temporaries (v4/v5 and rscratch1/rscratch2 are
//         clobbered as well)
//
// Encoding stops at the first char with any of bits 8..15 set.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
  Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
      NEXT_32_START, NEXT_32_PRFM_START;
  Register tmp1 = rscratch1, tmp2 = rscratch2;

  mov(result, len); // Save initial len

  cmp(len, (u1)8); // handle shortest strings first
  br(LT, LOOP_1);
  cmp(len, (u1)32);
  br(LT, NEXT_8);
  // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
  // to convert chars to bytes
  if (SoftwarePrefetchHintDistance >= 0) {
    // Prefetching variant: 32 chars (64 bytes) per iteration with a
    // software prefetch hint ahead of the loads.
    ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
    br(LE, NEXT_32_START);
    b(NEXT_32_PRFM_START);
    BIND(NEXT_32_PRFM);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_PRFM_START);
      prfm(Address(src, SoftwarePrefetchHintDistance));
      orr(v4, T16B, Vtmp1, Vtmp2);
      orr(v5, T16B, Vtmp3, Vtmp4);
      // uzp1 gathers the low bytes of each char; uzp2 gathers the high
      // bytes, which must all be zero for the chunk to be encodable.
      uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
      uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(v5, T16B, v4, v5); // high bytes
      umov(tmp2, v5, D, 1);
      fmovd(tmp1, v5);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);  // some high byte set: fall back to finer loops
      stpq(Vtmp1, Vtmp3, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(GE, NEXT_32_PRFM);
      cmp(len, (u1)32);
      br(LT, LOOP_8);
    BIND(NEXT_32);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    BIND(NEXT_32_START);
  } else {
    BIND(NEXT_32);
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
  }
  prfm(Address(src, SoftwarePrefetchHintDistance));
  uzp1(v4, T16B, Vtmp1, Vtmp2);
  uzp1(v5, T16B, Vtmp3, Vtmp4);
  orr(Vtmp1, T16B, Vtmp1, Vtmp2);
  orr(Vtmp3, T16B, Vtmp3, Vtmp4);
  uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
  umov(tmp2, Vtmp1, D, 1);
  fmovd(tmp1, Vtmp1);
  orr(tmp1, tmp1, tmp2);
  cbnz(tmp1, LOOP_8);  // some high byte set: fall back to finer loops
  stpq(v4, v5, dst);
  sub(len, len, 32);
  add(dst, dst, 32);
  add(src, src, 64);
  cmp(len, (u1)32);
  br(GE, NEXT_32);
  cbz(len, DONE);

  BIND(LOOP_8);
  // 8-char (16-byte) SIMD chunks.
  cmp(len, (u1)8);
  br(LT, LOOP_1);
  BIND(NEXT_8);
    ld1(Vtmp1, T8H, src);
    uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
    uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
    fmovd(tmp1, Vtmp3);
    cbnz(tmp1, NEXT_1);  // some high byte set: locate it char by char
    strd(Vtmp2, dst);

    sub(len, len, 8);
    add(dst, dst, 8);
    add(src, src, 16);
    cmp(len, (u1)8);
    br(GE, NEXT_8);

  BIND(LOOP_1);

  cbz(len, DONE);
  BIND(NEXT_1);
    // Scalar loop: copy one char per iteration until a non-Latin-1 char.
    ldrh(tmp1, Address(post(src, 2)));
    tst(tmp1, 0xff00);
    br(NE, SET_RESULT);
    strb(tmp1, Address(post(dst, 1)));
    subs(len, len, 1);
    br(GT, NEXT_1);

  BIND(SET_RESULT);
    sub(result, result, len); // Return index where we stopped
                              // Return len == 0 if we processed all
                              // characters
  BIND(DONE);
}
5762
5763
5764 // Inflate byte[] array to char[].
// Inflate byte[] array to char[].
//
// src:  source byte data (clobbered)
// dst:  destination char buffer (clobbered)
// len:  number of bytes to inflate (clobbered)
// vtmp1-vtmp3, tmp4: temporaries (clobbered; rscratch1 also reserved)
//
// NOTE: the tail code stores a final 16-byte quadword ending exactly at
// dst + 2*len, re-writing up to 8 chars that were already produced, so the
// destination must have room for the full inflated length.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);       // vtmp1 = 0; zip1 with it widens bytes to chars
  lsrw(tmp4, len, 3);     // tmp4 = number of whole 8-byte chunks
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      // Large input: delegate to the stub, then re-enter above to handle
      // whatever it leaves behind.
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);     // len = residual bytes after the 8-byte chunks
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      // Two chunks per iteration: widen and store 16 chars.
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);     // len = residual bytes after the 8-byte chunks
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
  // Read the LAST 8 source bytes and store the LAST 16 output bytes; this
  // overlaps data already written, which is harmless.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}
5861
5862 // Compress char[] array to byte[].
// Compress char[] array to byte[].
// Encodes len chars from src into bytes at dst.  result is the original
// len when every char fit in one byte, and 0 otherwise (all-or-nothing).
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  // encode_iso_array leaves the count of remaining (unencoded) chars in
  // len and the count of successfully encoded chars in result.
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // result = (len == 0) ? result : 0 — report failure unless everything
  // was compressed.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}
5872
5873 // get_thread() can be called anywhere inside generated code so we
5874 // need to save whatever non-callee save context might get clobbered
5875 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5876 // the call setup code.
5877 //
5878 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5879 //
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1, and lr — the registers the helper call may clobber —
  // except when one of them is the destination itself.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    // The helper returns the current thread in c_rarg0 (r0); copy it out.
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}
5892
// Write back the cache line addressed by line.base() to the point of
// persistence.  The Address must be a bare base register: no index,
// no offset, no shift.
void MacroAssembler::cache_wb(Address line) {
  assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
  assert(line.index() == noreg, "index should be noreg");
  assert(line.offset() == 0, "offset should be 0");
  // would like to assert this
  // assert(line._ext.shift == 0, "shift should be zero");
  if (VM_Version::supports_dcpop()) {
    // writeback using clear virtual address to point of persistence
    dc(Assembler::CVAP, line.base());
  } else {
    // no need to generate anything as Unsafe.writebackMemory should
    // never invoke this stub
  }
}
5907
cache_wbsync(bool is_pre)5908 void MacroAssembler::cache_wbsync(bool is_pre) {
5909 // we only need a barrier post sync
5910 if (!is_pre) {
5911 membar(Assembler::AnyAny);
5912 }
5913 }
5914