1 /*
2 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include <sys/types.h>
27
28 #include "precompiled.hpp"
29 #include "jvm.h"
30 #include "asm/assembler.hpp"
31 #include "asm/assembler.inline.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/barrierSetAssembler.hpp"
35 #include "gc/shared/cardTableBarrierSet.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "compiler/disassembler.hpp"
38 #include "memory/resourceArea.hpp"
39 #include "memory/universe.hpp"
40 #include "nativeInst_aarch64.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedOops.inline.hpp"
43 #include "oops/klass.inline.hpp"
44 #include "runtime/biasedLocking.hpp"
45 #include "runtime/icache.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/jniHandles.inline.hpp"
48 #include "runtime/sharedRuntime.hpp"
49 #include "runtime/thread.hpp"
50 #include "utilities/powerOfTwo.hpp"
51 #ifdef COMPILER1
52 #include "c1/c1_LIRAssembler.hpp"
53 #endif
54 #ifdef COMPILER2
55 #include "oops/oop.hpp"
56 #include "opto/compile.hpp"
57 #include "opto/node.hpp"
58 #include "opto/output.hpp"
59 #endif
60
61 #ifdef PRODUCT
62 #define BLOCK_COMMENT(str) /* nothing */
63 #else
64 #define BLOCK_COMMENT(str) block_comment(str)
65 #endif
66 #define STOP(str) stop(str);
67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
68
69 // Patch any kind of instruction; there may be several instructions.
70 // Return the total length (in bytes) of the instructions.
// Patch the instruction(s) at 'branch' so that they transfer control to
// (or materialize the address of) 'target'.  The kind of patch applied
// is determined by decoding the first instruction at 'branch'.  Returns
// the total length in bytes of the instructions patched.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Branch-style immediates are encoded in words, hence the >> 2.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal): 19-bit signed word offset in bits 23..5.
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate): 26-bit signed word offset.
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate): 14-bit signed word offset.
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    // Bit 31 distinguishes the page-granular form (shift != 0) from the
    // byte-granular one.
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;  // offset of target within its 4K page
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate).  The immediate field
        // is scaled by the access size, so the target must be aligned.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32: the movk carries bits [47:32] of the target,
        // so the adrp page offset is recomputed against the low 32 bits
        // of the target combined with the high bits of 'branch'.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // Split the offset into the ADR/ADRP immlo (bits 30..29, 2 bits)
    // and immhi (bits 23..5) fields.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: three instructions carrying 16 bits each
    // (the two following instructions must be movk).
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do: polling page load (destination register is zr)
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
167
// Patch the move-constant sequence at insn_addr so that it materializes
// the oop 'o'.  Returns the length in bytes of the patched sequence.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits). We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP: first instruction carries bits [31:16] of the
    // encoded oop, the following movk carries bits [15:0].
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP: three instructions of 16 bits each, low half first.
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
193
patch_narrow_klass(address insn_addr,narrowKlass n)194 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
195 // Metatdata pointers are either narrow (32 bits) or wide (48 bits).
196 // We encode narrow ones by setting the upper 16 bits in the first
197 // instruction.
198 NativeInstruction *insn = nativeInstruction_at(insn_addr);
199 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
200 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
201
202 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
203 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
204 return 2 * NativeInstruction::instruction_size;
205 }
206
// Decode the instruction(s) at insn_addr and return the target address
// they branch to / load from / materialize.  This is the inverse of
// pd_patch_instruction_size for the same instruction forms.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal): 19-bit signed word offset.
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: reassemble the offset from the 2-bit immlo
    // (bits 30..29) and the 19-bit signed immhi (bits 23..5).
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      // Page-granular (shift == 12) form: compute the target page, then
      // look at the following instruction to recover the in-page offset.
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm12<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate): the immediate is
        // scaled by the access size.
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        // Page-aligned relocation; a following movk (if present and
        // targeting the same register) supplies bits [47:32].
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling page load (destination register is zr): no target address.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Branch forms fall through to here: offset is in words.
  return address(((uint64_t)insn_addr + (offset << 2)));
}
290
safepoint_poll(Label & slow_path)291 void MacroAssembler::safepoint_poll(Label& slow_path) {
292 ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
293 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
294 }
295
296 // Just like safepoint_poll, but use an acquiring load for thread-
297 // local polling.
298 //
299 // We need an acquire here to ensure that any subsequent load of the
300 // global SafepointSynchronize::_state flag is ordered after this load
301 // of the local Thread::_polling page. We don't want this poll to
302 // return false (i.e. not safepointing) and a later poll of the global
303 // SafepointSynchronize::_state spuriously to return true.
304 //
305 // This is to avoid a race when we're in a native->Java transition
306 // racing the code which wakes up from a safepoint.
307 //
safepoint_poll_acquire(Label & slow_path)308 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
309 lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
310 ldar(rscratch1, rscratch1);
311 tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
312 }
313
reset_last_Java_frame(bool clear_fp)314 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
315 // we must set sp to zero to clear frame
316 str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
317
318 // must clear fp, so that compiled frames are not confused; it is
319 // possible that we need it only for debugging
320 if (clear_fp) {
321 str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
322 }
323
324 // Always clear the pc because it could have been set by make_walkable()
325 str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
326 }
327
328 // Calls to C land
329 //
330 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
331 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
332 // has to be reset to 0. This is required to allow proper stack traversal.
set_last_Java_frame(Register last_java_sp,Register last_java_fp,Register last_java_pc,Register scratch)333 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
334 Register last_java_fp,
335 Register last_java_pc,
336 Register scratch) {
337
338 if (last_java_pc->is_valid()) {
339 str(last_java_pc, Address(rthread,
340 JavaThread::frame_anchor_offset()
341 + JavaFrameAnchor::last_Java_pc_offset()));
342 }
343
344 // determine last_java_sp register
345 if (last_java_sp == sp) {
346 mov(scratch, sp);
347 last_java_sp = scratch;
348 } else if (!last_java_sp->is_valid()) {
349 last_java_sp = esp;
350 }
351
352 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
353
354 // last_java_fp is optional
355 if (last_java_fp->is_valid()) {
356 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
357 }
358 }
359
set_last_Java_frame(Register last_java_sp,Register last_java_fp,address last_java_pc,Register scratch)360 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
361 Register last_java_fp,
362 address last_java_pc,
363 Register scratch) {
364 assert(last_java_pc != NULL, "must provide a valid PC");
365
366 adr(scratch, last_java_pc);
367 str(scratch, Address(rthread,
368 JavaThread::frame_anchor_offset()
369 + JavaFrameAnchor::last_Java_pc_offset()));
370
371 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
372 }
373
set_last_Java_frame(Register last_java_sp,Register last_java_fp,Label & L,Register scratch)374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
375 Register last_java_fp,
376 Label &L,
377 Register scratch) {
378 if (L.is_bound()) {
379 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
380 } else {
381 InstructionMark im(this);
382 L.add_patch_at(code(), locator());
383 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
384 }
385 }
386
far_call(Address entry,CodeBuffer * cbuf,Register tmp)387 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
388 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
389 assert(CodeCache::find_blob(entry.target()) != NULL,
390 "destination of far call not found in code cache");
391 if (far_branches()) {
392 unsigned long offset;
393 // We can use ADRP here because we know that the total size of
394 // the code cache cannot exceed 2Gb.
395 adrp(tmp, entry, offset);
396 add(tmp, tmp, offset);
397 if (cbuf) cbuf->set_insts_mark();
398 blr(tmp);
399 } else {
400 if (cbuf) cbuf->set_insts_mark();
401 bl(entry);
402 }
403 }
404
far_jump(Address entry,CodeBuffer * cbuf,Register tmp)405 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
406 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
407 assert(CodeCache::find_blob(entry.target()) != NULL,
408 "destination of far call not found in code cache");
409 if (far_branches()) {
410 unsigned long offset;
411 // We can use ADRP here because we know that the total size of
412 // the code cache cannot exceed 2Gb.
413 adrp(tmp, entry, offset);
414 add(tmp, tmp, offset);
415 if (cbuf) cbuf->set_insts_mark();
416 br(tmp);
417 } else {
418 if (cbuf) cbuf->set_insts_mark();
419 b(entry);
420 }
421 }
422
reserved_stack_check()423 void MacroAssembler::reserved_stack_check() {
424 // testing if reserved zone needs to be enabled
425 Label no_reserved_zone_enabling;
426
427 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
428 cmp(sp, rscratch1);
429 br(Assembler::LO, no_reserved_zone_enabling);
430
431 enter(); // LR and FP are live.
432 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
433 mov(c_rarg0, rthread);
434 blr(rscratch1);
435 leave();
436
437 // We have already removed our own frame.
438 // throw_delayed_StackOverflowError will think that it's been
439 // called by our caller.
440 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
441 br(rscratch1);
442 should_not_reach_here();
443
444 bind(no_reserved_zone_enabling);
445 }
446
// Emit the fast path for acquiring a biased lock on obj_reg.  On
// successful bias acquisition control transfers to 'done' (or to
// 'slow_case' when the CAS fails and revocation must happen in the
// runtime); if the object's header does not carry the bias pattern,
// control falls through past 'cas_label' to the CAS-based locking code.
// Returns the code offset of the load of the mark word (the implicit
// null check), or -1 if swap_reg already contained the mark word.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  // Ignore the age bits when comparing; everything else must match.
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}
607
biased_locking_exit(Register obj_reg,Register temp_reg,Label & done)608 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
609 assert(UseBiasedLocking, "why call this otherwise?");
610
611 // Check for biased locking unlock case, which is a no-op
612 // Note: we do not have to check the thread ID for two reasons.
613 // First, the interpreter checks for IllegalMonitorStateException at
614 // a higher level. Second, if the bias was revoked while we held the
615 // lock, the object could not be rebiased toward another thread, so
616 // the bias bit would be clear.
617 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
618 andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
619 cmp(temp_reg, (u1)markWord::biased_lock_pattern);
620 br(Assembler::EQ, done);
621 }
622
pass_arg0(MacroAssembler * masm,Register arg)623 static void pass_arg0(MacroAssembler* masm, Register arg) {
624 if (c_rarg0 != arg ) {
625 masm->mov(c_rarg0, arg);
626 }
627 }
628
pass_arg1(MacroAssembler * masm,Register arg)629 static void pass_arg1(MacroAssembler* masm, Register arg) {
630 if (c_rarg1 != arg ) {
631 masm->mov(c_rarg1, arg);
632 }
633 }
634
pass_arg2(MacroAssembler * masm,Register arg)635 static void pass_arg2(MacroAssembler* masm, Register arg) {
636 if (c_rarg2 != arg ) {
637 masm->mov(c_rarg2, arg);
638 }
639 }
640
pass_arg3(MacroAssembler * masm,Register arg)641 static void pass_arg3(MacroAssembler* masm, Register arg) {
642 if (c_rarg3 != arg ) {
643 masm->mov(c_rarg3, arg);
644 }
645 }
646
// Common code for calls into the VM runtime.  Records the last Java
// frame in the thread's frame anchor, calls entry_point with the
// current thread as the first C argument, resets the anchor afterwards
// and (if check_exceptions) forwards any pending exception.  If
// oop_result is valid it receives the oop result the VM call left in
// the thread.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // Label 'l' marks the return pc that gets recorded in the frame anchor.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with PAC signature during throw_pending_exception
  // if it was tail-call optimized by compiler, since lr is not callee-saved
  // reload it with proper value
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    // Pending exception: tail-jump to the shared forwarding stub.
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
715
// Convenience wrapper around call_VM_base: the thread and last Java sp
// registers are passed as noreg and defaulted inside call_VM_base.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}
719
720 // Maybe emit a call via a trampoline. If the code cache is small
721 // trampolines won't be emitted.
722
// Emit a call to 'entry', going via a trampoline stub when branches are
// far.  Returns the pc after the call instruction (any non-null address
// on success), or NULL if the trampoline stub could not be emitted
// because the code cache is full.
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->output()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        postcond(pc() == badAddress);
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Far branch: emit bl to the current pc; the trampoline stub emitted
    // above is associated with this call site via its relocation.
    bl(pc());
  }
  // just need to return a non-null address
  postcond(pc() != badAddress);
  return pc();
}
761
762
// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
//
// Returns the stub's start address (used to patch the call site), or NULL
// if the stub section could not be grown.
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  // The 64-bit destination is stored inline right after the branch so the
  // literal load can reach it; its offset must match the fixed layout that
  // NativeCallTrampolineStub expects.
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  // Sanity-check that what we emitted matches the expected stub layout.
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
809
// Emit the stub used by static call sites. Both the metadata constant and
// the branch target are placeholders (NULL / 0) that are patched when the
// call is bound; the layout must therefore stay exactly as emitted here.
void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  // Placeholder Method*; patched to the callee when set_to_interpreted runs.
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  // Placeholder address; patched to the real entry later.
  movptr(rscratch1, 0);
  br(rscratch1);
}
821
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);                // test only the low byte
  cset(x, Assembler::NE);      // x = (low byte != 0) ? 1 : 0
}
830
// Emit an inline-cache call: loads the IC sentinel (non_oop_word) into
// rscratch2 — the cached-class slot the IC machinery later patches — then
// calls 'entry' via trampoline_call with a virtual_call relocation.
// Returns the address just past the call (NULL if the CodeCache is full).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}
839
840 // Implementation of call_VM versions
841
// call_VM with no Java arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}
847
// call_VM with one Java argument (placed in c_rarg1 by pass_arg1).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}
855
// call_VM with two Java arguments. Arguments are passed in reverse order
// so that moving arg_2 into c_rarg2 cannot clobber arg_1 before it is
// moved (guarded by the assert).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}
866
// call_VM with three Java arguments. Arguments are passed highest-numbered
// first so each move into c_rargN cannot clobber a still-pending source
// register; the asserts check exactly that.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}
883
// call_VM variant with an explicit last_java_sp; uses rthread as the
// thread register directly.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
891
// call_VM with explicit last_java_sp and one Java argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
900
// call_VM with explicit last_java_sp and two Java arguments; args are
// passed in reverse order to avoid clobbering (see assert).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
913
// call_VM with explicit last_java_sp and three Java arguments; args are
// passed highest-numbered first to avoid clobbering (see asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
929
930
// Fetch the oop result a VM call left in the thread-local vm_result slot,
// then clear the slot (the load must precede the clearing store).
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}
936
// Fetch the metadata result a VM call left in the thread-local vm_result_2
// slot, then clear the slot. No oop verification: this is metadata, not an oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}
941
align(int modulus)942 void MacroAssembler::align(int modulus) {
943 while (offset() % modulus != 0) nop();
944 }
945
// these are no-ops overridden by InterpreterMacroAssembler

// Intentionally empty here; InterpreterMacroAssembler overrides this to
// emit the JVMTI early-return check.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
949
check_and_handle_popframe(Register java_thread)950 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
951
952
// Materialize a "delayed" constant: if the value behind delayed_value_addr
// is already known (non-zero) at code-generation time, fold it into a
// constant; otherwise emit code that loads it at run time into tmp.
// Returns either the folded constant or tmp as a RegisterOrConstant.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    // Value resolved during compilation: no code emitted.
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}
968
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
//
// Emits a peeled linear scan of the receiver's itable: the first iteration
// (peel == 1) tests the common hit-on-first-entry case, the second
// (peel == 0) forms the loop body. With return_method == false only the
// interface check is performed and recv_klass is preserved.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * wordSize + vtable_base
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    // Second (peel == 0) pass only emits the compare-and-branch above.
    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // method_result = *(recv_klass + itable_offset) (zero-extended 32-bit offset).
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}
1048
// virtual method calling
//
// Load the Method* for vtable slot 'vtable_index' of recv_klass into
// method_result. A register index is scaled by wordSize; a constant index
// is folded into the displacement (rscratch1 may be used to form it).
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    // method_result = recv_klass + index * wordSize, then load the entry.
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
1069
// Full subtype check: branch to L_success if sub_klass is a subtype of
// super_klass, otherwise fall through (the fast path's failure label lands
// immediately after the slow path).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  // Fast path falls through to the slow path when it cannot decide.
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
1079
1080
// Fast-path subtype check: identity test followed by the super_check_offset
// display probe. Any of L_success / L_failure / L_slow_path may be NULL (at
// most one), in which case that outcome falls through. When the probe hits
// the secondary-super-cache slot the result is inconclusive and control
// goes to L_slow_path.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // Default (-1) sentinel means we must load the offset from super_klass.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    // Compare the offset against the cache slot to see if the probe above
    // was decisive (flags from subs drive the branches below).
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}
1174
// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
// On exit the flags reflect the last cmp: EQ if value was found.
// addr and count are clobbered; scratch holds the last word examined.
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  // Nothing to scan; flags are left as the caller set them.
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));   // load and post-increment by 8
  cmp(value, scratch);
  br(EQ, Lexit);                        // found: exit with EQ set
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1191
// scans count 4 byte words at [addr] for occurence of value,
// generic
// On exit the flags reflect the last cmpw: EQ if value was found.
// NOTE(review): the load is 32-bit but addr advances by wordSize (8) per
// iteration, i.e. word-spaced 32-bit elements — confirm that is what
// callers expect given the "4 byte words" comment above.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1206
// Slow-path subtype check: linear scan of sub_klass's secondary-supers
// array for super_klass. On a hit, super_klass is stored into the
// secondary-super cache so the fast path succeeds next time. Uses fixed
// registers r0/r2/r5 (spilled around the scan unless they serve as temps).
// Either L_success or L_failure (not both) may be NULL, meaning
// fall-through for that outcome.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial-subtype counter (diagnostics only, non-atomic).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
1292
// Class-initialization barrier: branch to L_fast_path if 'klass' is fully
// initialized or the current thread is its initializer; otherwise branch to
// L_slow_path. Exactly one of the labels may be NULL, meaning that outcome
// falls through.
void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
  assert_different_registers(klass, rthread, scratch);

  Label L_fallthrough, L_tmp;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }
  // Fast path check: class is fully initialized
  ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
  subs(zr, scratch, InstanceKlass::fully_initialized);
  br(Assembler::EQ, *L_fast_path);

  // Fast path check: current thread is initializer thread
  ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
  cmp(rthread, scratch);

  if (L_slow_path == &L_fallthrough) {
    br(Assembler::EQ, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    br(Assembler::NE, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
1322
// Debug-only oop verification (no-op unless -XX:+VerifyOops): calls the
// verify_oop stub on the value in 'reg'. All registers the sequence uses
// (r0, rscratch1, rscratch2, lr) are saved and restored, so this is
// transparent to surrounding code.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Preserve the registers clobbered below (two pairs = 4 words on stack).
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);                           // oop to verify
  movptr(rscratch1, (uintptr_t)(address)b); // message for failure reporting

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}
1352
// Debug-only oop verification for an oop held in memory at 'addr' (no-op
// unless -XX:+VerifyOops). Saves/restores the registers it clobbers and
// compensates for its own pushes when addr is sp-relative.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    // The two stp's above moved sp down 4 words; re-add them.
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  movptr(rscratch1, (uintptr_t)(address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
1388
argument_address(RegisterOrConstant arg_slot,int extra_slot_offset)1389 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1390 int extra_slot_offset) {
1391 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1392 int stackElementSize = Interpreter::stackElementSize;
1393 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1394 #ifdef ASSERT
1395 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1396 assert(offset1 - offset == stackElementSize, "correct arithmetic");
1397 #endif
1398 if (arg_slot.is_constant()) {
1399 return Address(esp, arg_slot.as_constant() * stackElementSize
1400 + offset);
1401 } else {
1402 add(rscratch1, esp, arg_slot.as_register(),
1403 ext::uxtx, exact_log2(stackElementSize));
1404 return Address(rscratch1, offset);
1405 }
1406 }
1407
call_VM_leaf_base(address entry_point,int number_of_arguments,Label * retaddr)1408 void MacroAssembler::call_VM_leaf_base(address entry_point,
1409 int number_of_arguments,
1410 Label *retaddr) {
1411 Label E, L;
1412
1413 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1414
1415 mov(rscratch1, entry_point);
1416 blr(rscratch1);
1417 if (retaddr)
1418 bind(*retaddr);
1419
1420 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1421 maybe_isb();
1422 }
1423
// Leaf call with no register arguments to marshal.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1427
// Leaf call with one argument (moved into c_rarg0 by pass_arg0).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}
1432
// Leaf call with two arguments.
// NOTE(review): unlike the super_call_VM_leaf variants, args here are
// passed in ascending order with no anti-clobber asserts — callers must
// ensure arg_1 is not c_rarg0.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}
1438
// Leaf call with three arguments (ascending order; see two-arg note on
// clobbering — callers must ensure later args don't occupy earlier c_rargN).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
1446
// Leaf call that explicitly uses MacroAssembler's implementation (bypasses
// any InterpreterMacroAssembler override of call_VM_leaf_base).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1451
// Two-argument variant; args are passed in reverse order so the move into
// c_rarg1 cannot clobber arg_0 (guarded by the assert).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1459
// Three-argument variant; args passed highest-numbered first so each move
// into c_rargN cannot clobber a still-pending source (see asserts).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1469
// Four-argument variant; same reverse-order anti-clobber discipline.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1483
null_check(Register reg,int offset)1484 void MacroAssembler::null_check(Register reg, int offset) {
1485 if (needs_explicit_null_check(offset)) {
1486 // provoke OS NULL exception if reg = NULL by
1487 // accessing M[reg] w/o changing any registers
1488 // NOTE: this is plenty to provoke a segv
1489 ldr(zr, Address(reg));
1490 } else {
1491 // nothing to do, (later) access of M[reg + offset]
1492 // will provoke OS NULL exception if reg = NULL
1493 }
1494 }
1495
// MacroAssembler protected routines needed to implement
// public methods

// Load the target of 'dest' into r as a relocated, patchable immediate
// (emitted via movptr so the relocation can later rewrite it).
void MacroAssembler::mov(Register r, Address dest) {
  // Record the relocation before emitting the instructions it covers.
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}
1504
// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
//
// The movz/movk/movk triple is always emitted in full (even for small
// values) so patching code can rely on a fixed-length sequence.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    // Annotate the disassembly with the constant being materialized.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);          // bits [15:0]
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);      // bits [31:16]
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);      // bits [47:32]
}
1524
// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
//
// Strategy: count the non-zero bytes of imm32 and of its complement; build
// the value with whichever family (movi/orri vs mvni/bici) needs fewer
// instructions, one shifted byte at a time.
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    // Byte lanes: a single movi covers the whole immediate.
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    // Halfword lanes: only the low 16 bits are meaningful.
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count instructions needed for the positive vs complemented encoding.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  // Skip zero low bytes, then seed with the first significant byte.
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  // Merge each remaining non-zero byte with bici (complement path) or
  // orri (positive path).
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
1570
// Materialize an arbitrary 64-bit constant into dst using the cheapest
// sequence we can find: a single ORR with a logical immediate when the
// value is encodable, otherwise MOVZ/MOVN plus up to three MOVKs chosen
// by counting all-zero and all-one 16-bit halfwords.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    // Annotate the code stream with the constant for disassembly.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split the constant into four 16-bit halfwords and count how many
    // are 0x0000 (favours MOVZ) or 0xffff (favours MOVN).
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ of the single non-zero halfword will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN of the single non-0xffff halfword will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      // MOVK the remaining non-zero halfword(s) above the MOVZ.
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}
1683
// Materialize a 32-bit constant into dst: a single ORRW when imm32 is a
// valid logical immediate, otherwise at most two move-wide instructions
// chosen from MOVZW/MOVNW/MOVKW based on the two 16-bit halfwords.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    // Annotate the code stream with the constant for disassembly.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      // low half is zero: MOVZ of the high half alone suffices
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      // low half is all ones: MOVN of the complemented high half
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}
1716
1717 // Form an address from base + offset in Rd. Rd may or may
1718 // not actually be used: you must use the Address that is returned.
1719 // It is up to you to ensure that the shift provided matches the size
1720 // of your data.
// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    // Fall back to materializing the full offset and adding it in.
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    // Split into a high part handled by an ADD-immediate and a low part
    // small enough for the load/store's scaled immediate field.
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
1751
// Atomically increment the 32-bit counter at counter_addr.
// Uses LDADD when LSE atomics are available, otherwise a
// load-exclusive/store-exclusive retry loop.  Clobbers tmp and tmp2.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}
1769
1770
int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem. The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    // remainder = dividend - quotient * divisor, computed with msubw
    sdivw(scratch, ra, rb); // scratch = ra / rb
    Assembler::msubw(result, scratch, rb, ra); // result = ra - scratch * rb
  }

  return idivl_offset;
}
1800
int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem. The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (! want_remainder) {
    sdiv(result, ra, rb);
  } else {
    // remainder = dividend - quotient * divisor, computed with msub
    sdiv(scratch, ra, rb); // scratch = ra / rb
    Assembler::msub(result, scratch, rb, ra); // result = ra - scratch * rb
  }

  return idivq_offset;
}
1830
// Emit a memory barrier, merging with the immediately preceding one
// when possible: if the last emitted instruction is a membar directly
// before pc(), just OR the new ordering bits into it instead of
// emitting a second dmb.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    // Record this barrier so a following membar() call can merge into it.
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
1845
// Try to merge this load/store with the previous one into an ldp/stp.
// Returns true if the merge happened (caller must not emit anything);
// returns false otherwise, in which case the caller emits the plain
// instruction and, if it is a merge candidate, we remember its pc.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    // A pair instruction cannot itself be merged with a later access.
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    // Only a naturally-aligned base_plus_offset access can start a pair.
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) {  // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}
1861
ldr(Register Rx,const Address & adr)1862 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1863 // We always try to merge two adjacent loads into one ldp.
1864 if (!try_merge_ldst(Rx, adr, 8, false)) {
1865 Assembler::ldr(Rx, adr);
1866 }
1867 }
1868
ldrw(Register Rw,const Address & adr)1869 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1870 // We always try to merge two adjacent loads into one ldp.
1871 if (!try_merge_ldst(Rw, adr, 4, false)) {
1872 Assembler::ldrw(Rw, adr);
1873 }
1874 }
1875
str(Register Rx,const Address & adr)1876 void MacroAssembler::str(Register Rx, const Address &adr) {
1877 // We always try to merge two adjacent stores into one stp.
1878 if (!try_merge_ldst(Rx, adr, 8, true)) {
1879 Assembler::str(Rx, adr);
1880 }
1881 }
1882
strw(Register Rw,const Address & adr)1883 void MacroAssembler::strw(Register Rw, const Address &adr) {
1884 // We always try to merge two adjacent stores into one stp.
1885 if (!try_merge_ldst(Rw, adr, 4, true)) {
1886 Assembler::strw(Rw, adr);
1887 }
1888 }
1889
1890 // MacroAssembler routines found actually to be needed
1891
push(Register src)1892 void MacroAssembler::push(Register src)
1893 {
1894 str(src, Address(pre(esp, -1 * wordSize)));
1895 }
1896
pop(Register dst)1897 void MacroAssembler::pop(Register dst)
1898 {
1899 ldr(dst, Address(post(esp, 1 * wordSize)));
1900 }
1901
1902 // Note: load_unsigned_short used to be called load_unsigned_word.
load_unsigned_short(Register dst,Address src)1903 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1904 int off = offset();
1905 ldrh(dst, src);
1906 return off;
1907 }
1908
load_unsigned_byte(Register dst,Address src)1909 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1910 int off = offset();
1911 ldrb(dst, src);
1912 return off;
1913 }
1914
load_signed_short(Register dst,Address src)1915 int MacroAssembler::load_signed_short(Register dst, Address src) {
1916 int off = offset();
1917 ldrsh(dst, src);
1918 return off;
1919 }
1920
load_signed_byte(Register dst,Address src)1921 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1922 int off = offset();
1923 ldrsb(dst, src);
1924 return off;
1925 }
1926
load_signed_short32(Register dst,Address src)1927 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1928 int off = offset();
1929 ldrshw(dst, src);
1930 return off;
1931 }
1932
load_signed_byte32(Register dst,Address src)1933 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1934 int off = offset();
1935 ldrsbw(dst, src);
1936 return off;
1937 }
1938
load_sized_value(Register dst,Address src,size_t size_in_bytes,bool is_signed,Register dst2)1939 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1940 switch (size_in_bytes) {
1941 case 8: ldr(dst, src); break;
1942 case 4: ldrw(dst, src); break;
1943 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1944 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1945 default: ShouldNotReachHere();
1946 }
1947 }
1948
store_sized_value(Address dst,Register src,size_t size_in_bytes,Register src2)1949 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1950 switch (size_in_bytes) {
1951 case 8: str(src, dst); break;
1952 case 4: strw(src, dst); break;
1953 case 2: strh(src, dst); break;
1954 case 1: strb(src, dst); break;
1955 default: ShouldNotReachHere();
1956 }
1957 }
1958
decrementw(Register reg,int value)1959 void MacroAssembler::decrementw(Register reg, int value)
1960 {
1961 if (value < 0) { incrementw(reg, -value); return; }
1962 if (value == 0) { return; }
1963 if (value < (1 << 12)) { subw(reg, reg, value); return; }
1964 /* else */ {
1965 guarantee(reg != rscratch2, "invalid dst for register decrement");
1966 movw(rscratch2, (unsigned)value);
1967 subw(reg, reg, rscratch2);
1968 }
1969 }
1970
decrement(Register reg,int value)1971 void MacroAssembler::decrement(Register reg, int value)
1972 {
1973 if (value < 0) { increment(reg, -value); return; }
1974 if (value == 0) { return; }
1975 if (value < (1 << 12)) { sub(reg, reg, value); return; }
1976 /* else */ {
1977 assert(reg != rscratch2, "invalid dst for register decrement");
1978 mov(rscratch2, (unsigned long)value);
1979 sub(reg, reg, rscratch2);
1980 }
1981 }
1982
// Decrement the 32-bit word at dst by value via a load/modify/store
// through rscratch1.  Literal addresses are first materialized into
// rscratch2.
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    // Large values would make decrementw(Register) clobber rscratch2,
    // which now holds the address -- hence the range restriction.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}
1995
// Decrement the 64-bit word at dst by value via a load/modify/store
// through rscratch1.  Literal addresses are first materialized into
// rscratch2.
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    // Large values would make decrement(Register) clobber rscratch2,
    // which now holds the address -- hence the range restriction.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}
2008
incrementw(Register reg,int value)2009 void MacroAssembler::incrementw(Register reg, int value)
2010 {
2011 if (value < 0) { decrementw(reg, -value); return; }
2012 if (value == 0) { return; }
2013 if (value < (1 << 12)) { addw(reg, reg, value); return; }
2014 /* else */ {
2015 assert(reg != rscratch2, "invalid dst for register increment");
2016 movw(rscratch2, (unsigned)value);
2017 addw(reg, reg, rscratch2);
2018 }
2019 }
2020
increment(Register reg,int value)2021 void MacroAssembler::increment(Register reg, int value)
2022 {
2023 if (value < 0) { decrement(reg, -value); return; }
2024 if (value == 0) { return; }
2025 if (value < (1 << 12)) { add(reg, reg, value); return; }
2026 /* else */ {
2027 assert(reg != rscratch2, "invalid dst for register increment");
2028 movw(rscratch2, (unsigned)value);
2029 add(reg, reg, rscratch2);
2030 }
2031 }
2032
// Increment the 32-bit word at dst by value via a load/modify/store
// through rscratch1.  Literal addresses are first materialized into
// rscratch2.
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // Large values would make incrementw(Register) clobber rscratch2,
    // which now holds the address -- hence the range restriction.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}
2045
// Increment the 64-bit word at dst by value via a load/modify/store
// through rscratch1.  Literal addresses are first materialized into
// rscratch2.
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // Large values would make increment(Register) clobber rscratch2,
    // which now holds the address -- hence the range restriction.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}
2058
2059
// Push r0-r30 (bit mask 0x7fffffff selects registers 0..30; sp is
// never pushed by push(bitset)).
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}
2063
// Pop r0-r30, the inverse of pusha().
void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}
2067
2068 // Push lots of registers in the bit set supplied. Don't push sp.
2069 // Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // Append zr so stp always has a full pair; if count was already even
  // the extra zr is dropped by the masking below.
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  // First pair pre-decrements the stack by the full frame, the rest
  // store at ascending offsets into the space just reserved.
  if (count) {
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2099
// Pop the registers in bitset, the exact inverse of push(bitset):
// inner pairs are loaded first, then the first pair post-increments
// the stack by the whole frame.  Returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // Pad with zr to a full pair, mirroring push(bitset).
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2129
2130 // Push lots of registers in the bit set supplied. Don't push sp.
2131 // Return the number of words pushed
int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 31; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }

  if (count == 0) {
    return 0;
  }

  // Single register: one 128-bit slot (2 words), no pairing needed.
  if (count == 1) {
    strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
    return 1;
  }

  // An odd count reserves one extra (unused) slot so the stack stays
  // 16-byte aligned.
  bool odd = (count & 1) == 1;
  int push_slots = count + (odd ? 1 : 0);

  // Always pushing full 128 bit registers.
  stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
  words_pushed += 2;

  for (int i = 2; i + 1 < count; i += 2) {
    stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
    words_pushed += 2;
  }

  // Store the odd trailing register on its own.
  if (odd) {
    strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
    words_pushed++;
  }

  assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
  return count;
}
2173
// Pop the FP/SIMD registers in bitset, the inverse of push_fp():
// loads in the reverse order, finishing with a post-increment of the
// (alignment-padded) frame.  Returns the number of registers popped.
int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 31; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }

  if (count == 0) {
    return 0;
  }

  if (count == 1) {
    ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
    return 1;
  }

  // Mirror push_fp's alignment padding for an odd register count.
  bool odd = (count & 1) == 1;
  int push_slots = count + (odd ? 1 : 0);

  if (odd) {
    ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
    words_pushed++;
  }

  for (int i = 2; i + 1 < count; i += 2) {
    ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
    words_pushed += 2;
  }

  // Final pair releases the whole frame including any padding slot.
  ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
  words_pushed += 2;

  assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);

  return count;
}
2215
2216 #ifdef ASSERT
// Debug check that rheapbase still holds the compressed-oops base.
// NOTE: the entire body is currently compiled out with #if 0, so this
// is a no-op even in ASSERT builds.
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (!UseCompressedOops || Universe::ptr_base() == NULL) {
    // rheapbase is allocated as general register
    return;
  }
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
2236 #endif
2237
resolve_jobject(Register value,Register thread,Register tmp)2238 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2239 Label done, not_weak;
2240 cbz(value, done); // Use NULL as-is.
2241
2242 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2243 tbz(r0, 0, not_weak); // Test for jweak tag.
2244
2245 // Resolve jweak.
2246 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2247 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2248 verify_oop(value);
2249 b(done);
2250
2251 bind(not_weak);
2252 // Resolve (untagged) jobject.
2253 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2254 verify_oop(value);
2255 bind(done);
2256 }
2257
// Halt the VM with a message: emit a dcps1 debug trap, followed
// inline by the message pointer (presumably recovered by the trap
// handler -- confirm against the corresponding signal-handling code).
void MacroAssembler::stop(const char* msg) {
  BLOCK_COMMENT(msg);
  dcps1(0xdeae);
  emit_int64((uintptr_t)msg);
}
2263
// Stop with an "unimplemented: <what>" message.  The message is copied
// into the code string table before the ResourceMark releases the
// stringStream's temporary buffer.
void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}
2274
2275 // If a constant does not fit in an immediate field, generate some
2276 // number of MOV instructions and then perform the operation.
// Emit insn1 (the immediate form) when imm is encodable; otherwise
// either split imm into two 12-bit chunks (possible below 2^24) or
// materialize it in Rd and use insn2 (the register form).
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Two instructions: high 12 bits (shifted form), then low 12 bits.
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Rd doubles as the scratch for the constant, so it must be
      // distinct from Rn.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}
2294
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
// Flag-setting variant: no two-instruction split is possible because
// only the final instruction may set the condition flags.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    // Rd doubles as the scratch for the constant.
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}
2309
2310
add(Register Rd,Register Rn,RegisterOrConstant increment)2311 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2312 if (increment.is_register()) {
2313 add(Rd, Rn, increment.as_register());
2314 } else {
2315 add(Rd, Rn, increment.as_constant());
2316 }
2317 }
2318
addw(Register Rd,Register Rn,RegisterOrConstant increment)2319 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2320 if (increment.is_register()) {
2321 addw(Rd, Rn, increment.as_register());
2322 } else {
2323 addw(Rd, Rn, increment.as_constant());
2324 }
2325 }
2326
sub(Register Rd,Register Rn,RegisterOrConstant decrement)2327 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2328 if (decrement.is_register()) {
2329 sub(Rd, Rn, decrement.as_register());
2330 } else {
2331 sub(Rd, Rn, decrement.as_constant());
2332 }
2333 }
2334
subw(Register Rd,Register Rn,RegisterOrConstant decrement)2335 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2336 if (decrement.is_register()) {
2337 subw(Rd, Rn, decrement.as_register());
2338 } else {
2339 subw(Rd, Rn, decrement.as_constant());
2340 }
2341 }
2342
// Reload rheapbase with the compressed-oops base.  Once the universe
// is fully initialized the base is a known constant and can be moved
// directly; before that it must be loaded through its external address.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, CompressedOops::ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}
2354
2355 // this simulates the behaviour of the x86 cmpxchg instruction using a
2356 // load linked/store conditional pair. we use the acquire/release
2357 // versions of these instructions so that we flush pending writes as
2358 // per Java semantics.
2359
2360 // n.b the x86 version assumes the old value to be compared against is
2361 // in rax and updates rax with the value located in memory if the
2362 // cmpxchg fails. we supply a register for the old value explicitly
2363
2364 // the aarch64 load linked/store conditional instructions do not
2365 // accept an offset. so, unlike x86, we must provide a plain register
2366 // to identify the memory word to be compared/exchanged rather than a
2367 // register+offset Address.
2368
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    // LSE path: casal does the whole compare-and-swap; compare its
    // result (the value found in memory) against the expected value.
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2404
// CAS on an object's mark word.  Relies on the mark word being at
// offset 0 so obj itself can serve as the word address.
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}
2410
// 32-bit variant of cmpxchgptr (see comments above it).
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2447
2448 // A generic CAS; success or failure is in the EQ flag. A weak CAS
2449 // doesn't retry and may fail spuriously. If the oldval is wanted,
2450 // Pass a register for the result, otherwise pass noreg.
2451
2452 // Clobbers rscratch1
// Generic CAS; on return the EQ flag indicates success.  result (or
// rscratch1 when result == noreg) receives the value found in memory.
// Clobbers rscratch1.
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    // Set the EQ flag from the value the CAS actually observed.
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      // A weak CAS may fail spuriously; report the store result via
      // the flags instead of retrying.
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}
2483
2484 // A generic comparison. Only compares for equality, clobbers rscratch1.
compare_eq(Register rm,Register rn,enum operand_size size)2485 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2486 if (size == xword) {
2487 cmp(rm, rn);
2488 } else if (size == word) {
2489 cmpw(rm, rn);
2490 } else if (size == halfword) {
2491 eorw(rscratch1, rm, rn);
2492 ands(zr, rscratch1, 0xffff);
2493 } else if (size == byte) {
2494 eorw(rscratch1, rm, rn);
2495 ands(zr, rscratch1, 0xff);
2496 } else {
2497 ShouldNotReachHere();
2498 }
2499 }
2500
2501
different(Register a,RegisterOrConstant b,Register c)2502 static bool different(Register a, RegisterOrConstant b, Register c) {
2503 if (b.is_constant())
2504 return a != c;
2505 else
2506 return a != b.as_register() && a != c && b.as_register() != c;
2507 }
2508
// ATOMIC_OP expands to MacroAssembler::atomic_<NAME>(prev, incr, addr):
// atomically *addr = *addr OP incr, returning the previous memory
// value in `prev` (pass an invalid register if it is not wanted).
//   NAME       - suffix of the generated method name
//   LDXR/STXR  - exclusive load/store pair for the LL/SC fallback loop
//   OP         - arithmetic op applied to the loaded value
//   IOP        - inverse of OP; reconstructs the old value from the
//                new one when `prev` had to serve as scratch
//   AOP        - single LSE atomic instruction used when UseLSE is set
//   sz         - operand size for the LSE form
// Clobbers rscratch1 and rscratch2 on the LL/SC path.
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    /* LSE: one atomic instruction; constants go via rscratch2. */      \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    /* result was clobbered; recover old value = new IOP incr. */       \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP
2544
// ATOMIC_XCHG expands to MacroAssembler::atomic_<OP>(prev, newv, addr):
// atomically store `newv` to *addr, returning the previous memory
// value in `prev` (pass an invalid register if it is not wanted).
//   AOP        - single LSE swap instruction used when UseLSE is set
//   LDXR/STXR  - exclusive pair for the LL/SC fallback loop
//   sz         - operand size for the LSE form
// Clobbers rscratch1 and rscratch2 on the LL/SC path.
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG
2573
2574 #ifndef PRODUCT
2575 extern "C" void findpc(intptr_t x);
2576 #endif
2577
// Runtime entry for debugging stops: called with a message, the
// faulting pc and a snapshot of the integer register file.  With
// ShowMessageBoxOnError it optionally dumps all registers and breaks
// into the debugger; in all cases it terminates with a fatal error.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x%016lx", regs[0]);
      tty->print_cr(" r1 = 0x%016lx", regs[1]);
      tty->print_cr(" r2 = 0x%016lx", regs[2]);
      tty->print_cr(" r3 = 0x%016lx", regs[3]);
      tty->print_cr(" r4 = 0x%016lx", regs[4]);
      tty->print_cr(" r5 = 0x%016lx", regs[5]);
      tty->print_cr(" r6 = 0x%016lx", regs[6]);
      tty->print_cr(" r7 = 0x%016lx", regs[7]);
      tty->print_cr(" r8 = 0x%016lx", regs[8]);
      tty->print_cr(" r9 = 0x%016lx", regs[9]);
      tty->print_cr("r10 = 0x%016lx", regs[10]);
      tty->print_cr("r11 = 0x%016lx", regs[11]);
      tty->print_cr("r12 = 0x%016lx", regs[12]);
      tty->print_cr("r13 = 0x%016lx", regs[13]);
      tty->print_cr("r14 = 0x%016lx", regs[14]);
      tty->print_cr("r15 = 0x%016lx", regs[15]);
      tty->print_cr("r16 = 0x%016lx", regs[16]);
      tty->print_cr("r17 = 0x%016lx", regs[17]);
      tty->print_cr("r18 = 0x%016lx", regs[18]);
      tty->print_cr("r19 = 0x%016lx", regs[19]);
      tty->print_cr("r20 = 0x%016lx", regs[20]);
      tty->print_cr("r21 = 0x%016lx", regs[21]);
      tty->print_cr("r22 = 0x%016lx", regs[22]);
      tty->print_cr("r23 = 0x%016lx", regs[23]);
      tty->print_cr("r24 = 0x%016lx", regs[24]);
      tty->print_cr("r25 = 0x%016lx", regs[25]);
      tty->print_cr("r26 = 0x%016lx", regs[26]);
      tty->print_cr("r27 = 0x%016lx", regs[27]);
      tty->print_cr("r28 = 0x%016lx", regs[28]);
      // NOTE(review): r29 is not printed here — presumably deliberate
      // (frame pointer); confirm before adding it.
      tty->print_cr("r30 = 0x%016lx", regs[30]);
      tty->print_cr("r31 = 0x%016lx", regs[31]);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}
2635
// Save the call-clobbered state: integer registers r0-r18 (minus the
// two rscratch registers) followed by the low 64 bits (T1D) of the
// call-clobbered vector registers v0-v7 and v16-v31.
// Must be paired with pop_call_clobbered_registers().
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;  // one st1 covers four 8-byte slots
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);                 // room for the final v0-v3 group
  mov(rscratch1, -step);             // post-decrement step for st1
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  // Final group v0-v3 is stored without writeback; sp now points at it.
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}
2650
// Restore the state saved by push_call_clobbered_registers(), in
// reverse: vector registers v0-v7, v16-v31 first (64 bits each, with
// post-increment), then the integer registers.
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    // Only v0-v7 and v16-v31 were saved; skip v8-v15.
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}
2660
// Save the full CPU state: integer registers r0-r29 (mask 0x3fffffff;
// lr and sp excluded) and all 32 vector registers — full 128 bits
// (T2D) when save_vectors, otherwise the low 64 bits (T1D).
// Must be paired with pop_CPU_state(save_vectors).
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;  // bytes per 4-reg group
  push(0x3fffffff, sp); // integer registers except lr & sp
  mov(rscratch1, -step);             // post-decrement step for st1
  sub(sp, sp, step);                 // room for the final v0-v3 group
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  // Final group v0-v3, stored without writeback.
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}
2672
// Restore the state saved by push_CPU_state(restore_vectors): all 32
// vector registers first (128 or 64 bits each, matching the save),
// then integer registers r0-r29.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;  // bytes per group
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp); // integer registers except lr & sp
}
2680
/**
 * Helpers for multiply_to_len().
 */
// 128-bit accumulate: final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2.
// Each 64-bit add propagates its carry into the high word via adc;
// dest_hi holds the intermediate high word and is clobbered.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}
2691
2692 // Generate an address from (r + r1 extend offset). "size" is the
2693 // size of the operand. The result may be in rscratch2.
offsetted_address(Register r,Register r1,Address::extend ext,int offset,int size)2694 Address MacroAssembler::offsetted_address(Register r, Register r1,
2695 Address::extend ext, int offset, int size) {
2696 if (offset || (ext.shift() % size != 0)) {
2697 lea(rscratch2, Address(r, r1, ext));
2698 return Address(rscratch2, offset);
2699 } else {
2700 return Address(r, r1, ext);
2701 }
2702 }
2703
// Form an sp-relative Address for a spill slot at `offset`,
// materializing part of the offset into `tmp` when it cannot be
// encoded directly in a load/store immediate.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  // Not aligned - 9 bits signed offset
  // Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Unaligned and beyond the 9-bit signed immediate: fold the low
    // 12 bits into tmp so the remaining offset is 4K-aligned.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    // Beyond the scaled 12-bit unsigned immediate: fold bits [23:12]
    // into the base as well.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
2725
2726 // Checks whether offset is aligned.
2727 // Returns true if it is, else false.
merge_alignment_check(Register base,size_t size,long cur_offset,long prev_offset) const2728 bool MacroAssembler::merge_alignment_check(Register base,
2729 size_t size,
2730 long cur_offset,
2731 long prev_offset) const {
2732 if (AvoidUnalignedAccesses) {
2733 if (base == sp) {
2734 // Checks whether low offset if aligned to pair of registers.
2735 long pair_mask = size * 2 - 1;
2736 long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2737 return (offset & pair_mask) == 0;
2738 } else { // If base is not sp, we can't guarantee the access is aligned.
2739 return false;
2740 }
2741 } else {
2742 long mask = size - 1;
2743 // Load/store pair instruction only supports element size aligned offset.
2744 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2745 }
2746 }
2747
2748 // Checks whether current and previous loads/stores can be merged.
2749 // Returns true if it can be merged, else false.
ldst_can_merge(Register rt,const Address & adr,size_t cur_size_in_bytes,bool is_store) const2750 bool MacroAssembler::ldst_can_merge(Register rt,
2751 const Address &adr,
2752 size_t cur_size_in_bytes,
2753 bool is_store) const {
2754 address prev = pc() - NativeInstruction::instruction_size;
2755 address last = code()->last_insn();
2756
2757 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2758 return false;
2759 }
2760
2761 if (adr.getMode() != Address::base_plus_offset || prev != last) {
2762 return false;
2763 }
2764
2765 NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2766 size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2767
2768 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2769 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2770
2771 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2772 return false;
2773 }
2774
2775 long max_offset = 63 * prev_size_in_bytes;
2776 long min_offset = -64 * prev_size_in_bytes;
2777
2778 assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2779
2780 // Only same base can be merged.
2781 if (adr.base() != prev_ldst->base()) {
2782 return false;
2783 }
2784
2785 long cur_offset = adr.offset();
2786 long prev_offset = prev_ldst->offset();
2787 size_t diff = abs(cur_offset - prev_offset);
2788 if (diff != prev_size_in_bytes) {
2789 return false;
2790 }
2791
2792 // Following cases can not be merged:
2793 // ldr x2, [x2, #8]
2794 // ldr x3, [x2, #16]
2795 // or:
2796 // ldr x2, [x3, #8]
2797 // ldr x2, [x3, #16]
2798 // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2799 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2800 return false;
2801 }
2802
2803 long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2804 // Offset range must be in ldp/stp instruction's range.
2805 if (low_offset > max_offset || low_offset < min_offset) {
2806 return false;
2807 }
2808
2809 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2810 return true;
2811 }
2812
2813 return false;
2814 }
2815
// Merge current load/store with previous load/store into ldp/stp.
// Precondition: ldst_can_merge() returned true for the same arguments.
// The previously emitted instruction is overwritten: the code buffer
// end is rewound to it and a single ldp/stp covering both accesses is
// emitted in its place.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // The access with the lower offset supplies the pair's base offset
  // and the first target register.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
2862
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * First loop of BigInteger::multiplyToLen, processed two 32-bit ints
 * at a time.  The arrays hold big-endian java ints, so each 64-bit
 * load is rotated by 32 to form a little-endian 64-bit value.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);   // only one int of x left

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);   // only one int of y left
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}
2924
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Inner loop of BigInteger::multiplyToLen, processing four 32-bit
 * ints of y/z per iteration with tail handling for two and then one
 * remaining int.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx]   = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of 4-int chunks to process in the unrolled loop
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  // Tail: one 64-bit (two-int) chunk if present.
  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // Tail: one final 32-bit int if present.
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  // carry = upper 32 bits of the 96-bit result carry2:tmp3
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}
3038
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * Computes z[0..zlen-1] = x[0..xlen-1] * y[0..ylen-1] where all
 * arrays hold big-endian 32-bit java ints.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);  // empty x: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  // Store the remaining carry from the first loop (one or two ints).
  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z (and reserve slots for ylen, x, xstart) across the inner loop.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);    // only one int of x left

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Spill the remaining live values, run the inner loop, then reload.
  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));
  b(L_third_loop_prologue);

  bind(L_done);
}
3174
// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
//
// On exit `out` holds the final carry (the intrinsic's return value).
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);  // len == 0: return 0
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    // product = in[j] * k + out[offset] + carry  (64-bit)
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);
    strw(rscratch1, Address(offset));   // low 32 bits -> out[offset]
    lsr(out, rscratch1, 32);            // high 32 bits -> carry
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}
3212
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 * Note: `val` is clobbered (it ends up holding the table entry).
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);                            // (val ^ crc)
  andr(val, val, 0xff);                          // & 0xFF
  ldrw(val, Address(table, val, Address::lsl(2))); // crc_table[...]
  eor(crc, val, crc, Assembler::LSR, 8);         // ^ (crc >> 8)
}
3231
/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]tmp       Scratch register (clobbered).
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 * @param upper         When true, fold the upper 32 bits of v instead
 *                      of the lower 32 bits.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // Select the desired 32-bit half of v and xor in the current crc.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);                                    // byte 0
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);                              // byte 1
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);                             // byte 2
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);                             // byte 3
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
3263
// CRC-32 over [buf, buf+len) using the hardware crc32x/w/b
// instructions.  The main path consumes 64 bytes per iteration with
// loads software-pipelined ahead of the crc updates; tails of 32, 4
// and 1 byte(s) handle the remainder.  The crc is bit-inverted on
// entry and exit, per the zlib CRC-32 convention.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);  // invert, per CRC-32 convention

    // Dispatch on remaining length: 128+ -> by64, 32+ -> by32,
    // 4+ -> by4, else by1.
    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

    // Prologue for the 64-byte loop: prime the pipeline so each loop
    // iteration's loads run ahead of the crc updates that consume them.
  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop: drain the last two pipelined words.
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);  // undo the inversion
}
3362
/**
 * Emit code computing CRC-32 (the polynomial used by java.util.zip.CRC32).
 *
 * @param crc    register containing existing CRC (32-bit); updated in place
 * @param buf    register pointing to input byte buffer (byte*); advanced
 * @param len    register containing number of bytes; clobbered
 * @param table0..table3  registers that will contain the addresses of the
 *               four 256-entry slices of the CRC lookup table
 * @param tmp, tmp2, tmp3  scratch registers
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  // Fast path: hardware CRC32 instructions, if available.
  if (UseCRC32) {
    kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
    return;
  }

  // CRC-32 works on the bitwise complement of the accumulator; invert on
  // entry and again before returning.
  mvnw(crc, crc);

  // Materialize the four table-slice base addresses.
  adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
  if (offset) add(table0, table0, offset);
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
    // NEON path: fold 32 bytes per iteration with carry-less multiplies
    // against precomputed constants stored after the four table slices.
    cmp(len, (u1)64);
    br(Assembler::LT, L_by16);
    eor(v16, T16B, v16, v16);

    Label L_fold;

    add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));
    ld1r(v5, T2D, post(tmp, 8));
    ld1r(v6, T2D, post(tmp, 8));
    ld1r(v7, T2D, post(tmp, 8));
    mov(v16, T4S, 0, crc);

    // XOR the incoming CRC into the low word of the first data vector.
    eor(v0, T16B, v0, v16);
    sub(len, len, 64);

    BIND(L_fold);
    // Fold v0 against the four folding constants.
    pmull(v22, T8H, v0, v5, T8B);
    pmull(v20, T8H, v0, v7, T8B);
    pmull(v23, T8H, v0, v4, T8B);
    pmull(v21, T8H, v0, v6, T8B);

    pmull2(v18, T8H, v0, v5, T16B);
    pmull2(v16, T8H, v0, v7, T16B);
    pmull2(v19, T8H, v0, v4, T16B);
    pmull2(v17, T8H, v0, v6, T16B);

    // Recombine the partial products (byte-wise pmull results) into
    // 64-bit folded lanes.
    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v17, T16B, v17, v21);

    ushll2(v20, T2D, v17, T4S, 16);
    ushll(v16, T2D, v17, T2S, 16);

    eor(v20, T16B, v20, v22);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v28, T16B, v17, v21);   // folded v0 result kept in v28

    // Same folding sequence for v1.
    pmull(v22, T8H, v1, v5, T8B);
    pmull(v20, T8H, v1, v7, T8B);
    pmull(v23, T8H, v1, v4, T8B);
    pmull(v21, T8H, v1, v6, T8B);

    pmull2(v18, T8H, v1, v5, T16B);
    pmull2(v16, T8H, v1, v7, T16B);
    pmull2(v19, T8H, v1, v4, T16B);
    pmull2(v17, T8H, v1, v6, T16B);

    // Load the next 32 bytes while the multiplies complete.
    ld1(v0, v1, T2D, post(buf, 32));

    uzp1(v24, T8H, v20, v22);
    uzp2(v25, T8H, v20, v22);
    eor(v20, T16B, v24, v25);

    uzp1(v26, T8H, v16, v18);
    uzp2(v27, T8H, v16, v18);
    eor(v16, T16B, v26, v27);

    ushll2(v22, T4S, v20, T8H, 8);
    ushll(v20, T4S, v20, T4H, 8);

    ushll2(v18, T4S, v16, T8H, 8);
    ushll(v16, T4S, v16, T4H, 8);

    eor(v22, T16B, v23, v22);
    eor(v18, T16B, v19, v18);
    eor(v20, T16B, v21, v20);
    eor(v16, T16B, v17, v16);

    uzp1(v17, T2D, v16, v20);
    uzp2(v21, T2D, v16, v20);
    eor(v16, T16B, v17, v21);

    ushll2(v20, T2D, v16, T4S, 16);
    ushll(v16, T2D, v16, T2S, 16);

    eor(v20, T16B, v22, v20);
    eor(v16, T16B, v16, v18);

    uzp1(v17, T2D, v20, v16);
    uzp2(v21, T2D, v20, v16);
    eor(v20, T16B, v17, v21);

    shl(v16, T2D, v28, 1);
    shl(v17, T2D, v20, 1);

    // XOR the folded values into the freshly-loaded data.
    eor(v0, T16B, v0, v16);
    eor(v1, T16B, v1, v17);

    subs(len, len, 32);
    br(Assembler::GE, L_fold);

    // Reduce the remaining 32 bytes in v0/v1 through the table-driven
    // word-at-a-time routine.
    mov(crc, 0);
    mov(tmp, v0, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v0, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 0);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    mov(tmp, v1, T1D, 1);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

    add(len, len, 32);
  }

  // Scalar path: dispatch on remaining length (16/4/1-byte loops).
  BIND(L_by16);
  subs(len, len, 16);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  BIND(L_by4_loop);
  ldrw(tmp, Address(post(buf, 4)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
  subs(len, len, 4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
  subs(len, len, 1);
  ldrb(tmp, Address(post(buf, 1)));
  update_byte_crc32(crc, tmp, table0);
  br(Assembler::GT, L_by1_loop);
  b(L_exit);

  align(CodeEntryAlignment);
  BIND(L_by16_loop);
  subs(len, len, 16);
  ldp(tmp, tmp3, Address(post(buf, 16)));
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
  update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
  br(Assembler::GE, L_by16_loop);
  adds(len, len, 16-4);
  br(Assembler::GE, L_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
  // Undo the complement applied on entry.
  mvnw(crc, crc);
}
3562
// Emit code computing CRC-32C using the hardware crc32c* instructions.
// Processes 64 bytes per iteration of the main loop (software-pipelined:
// loads for the next chunk are interleaved with CRC updates of the
// current one), then finishes with 32/4/1-byte loops.
// crc is updated in place; buf and len are clobbered.
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  // len is biased by -128 so the by-64 loop can test with a simple
  // subs/branch; the bias is corrected in the tail paths below.
  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128-32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  // Prologue of the by-64 loop: consume the first 64 bytes, leaving
  // tmp2/tmp3 holding the last two words for the loop body to fold in.
  BIND(CRC_by64_pre);
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop: fold in the two words left over from the pipeline.
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);

  // Re-bias len and adjust buf (it was pre-incremented 8 bytes early),
  // then fall back to the smaller loops unless exactly done.
  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}
3658
/**
 * Emit code computing CRC-32C (the Castagnoli polynomial, used by
 * java.util.zip.CRC32C).  Always uses the hardware crc32c instructions;
 * the table0..table3 registers are repurposed as the scratch registers
 * (tmp0..tmp3) of kernel_crc32c_using_crc32c — no lookup table is needed.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table0..table3  scratch registers
 * @param tmp, tmp2, tmp3  unused
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}
3671
3672
SkipIfEqual(MacroAssembler * masm,const bool * flag_addr,bool value)3673 SkipIfEqual::SkipIfEqual(
3674 MacroAssembler* masm, const bool* flag_addr, bool value) {
3675 _masm = masm;
3676 unsigned long offset;
3677 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3678 _masm->ldrb(rscratch1, Address(rscratch1, offset));
3679 _masm->cbzw(rscratch1, _label);
3680 }
3681
// Bind the skip target at the end of the guarded scope.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
3685
addptr(const Address & dst,int32_t src)3686 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3687 Address adr;
3688 switch(dst.getMode()) {
3689 case Address::base_plus_offset:
3690 // This is the expected mode, although we allow all the other
3691 // forms below.
3692 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3693 break;
3694 default:
3695 lea(rscratch2, dst);
3696 adr = Address(rscratch2);
3697 break;
3698 }
3699 ldr(rscratch1, adr);
3700 add(rscratch1, rscratch1, src);
3701 str(rscratch1, adr);
3702 }
3703
// Compare src1 with the pointer-sized value stored at the (pc-relative)
// address src2.  Clobbers rscratch1; sets condition flags.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}
3710
cmpoop(Register obj1,Register obj2)3711 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3712 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3713 bs->obj_equals(this, obj1, obj2);
3714 }
3715
// Load into rresult the ClassLoaderData of rmethod's holder klass.
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
3720
// Load into 'holder' the InstanceKlass* holding 'method', by chasing
// Method* -> ConstMethod* -> ConstantPool* -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}
3726
load_klass(Register dst,Register src)3727 void MacroAssembler::load_klass(Register dst, Register src) {
3728 if (UseCompressedClassPointers) {
3729 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3730 decode_klass_not_null(dst);
3731 } else {
3732 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3733 }
3734 }
3735
// ((OopHandle)result).resolve();
// 'result' holds an OopHandle (oop*); replace it with the oop it refers to.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection through a native (non-heap) slot.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}
3741
// ((WeakHandle)result).resolve();
// 'rresult' holds a WeakHandle; replace it with the referent (or null if
// the handle itself is null or the referent has been collected).
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cbz(rresult, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}
3757
load_mirror(Register dst,Register method,Register tmp)3758 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3759 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3760 ldr(dst, Address(rmethod, Method::const_offset()));
3761 ldr(dst, Address(dst, ConstMethod::constants_offset()));
3762 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3763 ldr(dst, Address(dst, mirror_offset));
3764 resolve_oop_handle(dst, tmp);
3765 }
3766
// Compare trial_klass against the klass of 'oop', setting condition flags.
// Clobbers tmp.  With compressed class pointers, avoids a full decode when
// the encoding permits comparing in narrow form.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      // Zero base: decoded klass == narrow klass << shift, so compare the
      // full register against the shifted narrow value.
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    // General case: decode the narrow klass and fall through.
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}
3785
// Load into dst the prototype mark word from src's klass (used by biased
// locking to obtain the canonical header for objects of that klass).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}
3790
// Store the Klass* in src into dst's object header, compressing it first
// when compressed class pointers are in use (src is clobbered by encoding).
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release? concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}
3801
store_klass_gap(Register dst,Register src)3802 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3803 if (UseCompressedClassPointers) {
3804 // Store to klass gap in destination
3805 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3806 }
3807 }
3808
// Algorithm must match CompressedOops::encode.
// Compress the (possibly null) oop in s into d.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    // Zero base: encoding is just an (optional) shift.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // Non-zero base: subtract the base; a null oop (below the base) would
    // underflow, so csel forces the result to zero in that case, then shift.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
       Label nonnull;
       cbnz(r, nonnull);
       sub(r, r, rheapbase);
       bind(nonnull);
       lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}
3836
// Compress the oop in r in place; r must not be null (checked in debug
// builds), which lets us skip the null-preserving csel of encode_heap_oop.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}
3856
// Compress the non-null oop in src into dst (two-register form); src is
// preserved unless dst == src.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks where the partially-encoded value currently lives so we
  // only emit a final mov when neither step wrote into dst.
  Register data = src;
  if (CompressedOops::base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    mov(dst, src);
}
3882
// Decompress the (possibly null) narrow oop in s into d, preserving null.
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    // Zero base: decoding is just a shift (or nothing if d == s and no shift).
    if (CompressedOops::shift() != 0 || d != s) {
      lsl(d, s, CompressedOops::shift());
    }
  } else {
    // Non-zero base: null must stay null, so branch around the add.
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
3901
// Decompress the narrow oop in r in place; r must not be null, so no
// null-preserving branch is needed.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // Zero base: shift only (zr add keeps the instruction count fixed).
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}
3919
// Decompress the non-null narrow oop in src into dst (two-register form).
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (CompressedOops::base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}
3940
// Cached narrow-klass encoding strategy; computed lazily by klass_decode_mode().
MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
3942
// Select (and cache) the cheapest strategy for encoding/decoding narrow
// klass pointers given the compressed-klass base and shift:
//   KlassDecodeZero - base is null: shift only
//   KlassDecodeXor  - base is a valid logical immediate disjoint from the
//                     klass range: a single eor folds the base in/out
//   KlassDecodeMovk - fallback: patch the high halfword with movk
MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
  assert(UseCompressedClassPointers, "not using compressed class pointers");
  assert(Metaspace::initialized(), "metaspace not initialized yet");

  if (_klass_decode_mode != KlassDecodeNone) {
    return _klass_decode_mode;
  }

  assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
         || 0 == CompressedKlassPointers::shift(), "decode alg wrong");

  if (CompressedKlassPointers::base() == NULL) {
    return (_klass_decode_mode = KlassDecodeZero);
  }

  if (operand_valid_for_logical_immediate(
        /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
    // Xor works only if the base has no bits inside the klass range.
    const uint64_t range_mask =
      (1UL << log2_intptr(CompressedKlassPointers::range())) - 1;
    if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
      return (_klass_decode_mode = KlassDecodeXor);
    }
  }

  // Movk requires the shifted base to occupy only bits [47:32].
  const uint64_t shifted_base =
    (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
  guarantee((shifted_base & 0xffff0000ffffffff) == 0,
            "compressed class base bad alignment");

  return (_klass_decode_mode = KlassDecodeMovk);
}
3974
// Compress the non-null Klass* in src into dst according to the selected
// klass_decode_mode().
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  switch (klass_decode_mode()) {
  case KlassDecodeZero:
    // No base: just undo the alignment shift (if any).
    if (CompressedKlassPointers::shift() != 0) {
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    break;

  case KlassDecodeXor:
    // Base bits are disjoint from the klass range, so xor removes the base.
    if (CompressedKlassPointers::shift() != 0) {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    break;

  case KlassDecodeMovk:
    // The base occupies bits above 32 after shifting, so the narrow klass
    // is simply the (shifted) low 32 bits.
    if (CompressedKlassPointers::shift() != 0) {
      ubfx(dst, src, LogKlassAlignmentInBytes, 32);
    } else {
      movw(dst, src);
    }
    break;

  case KlassDecodeNone:
    ShouldNotReachHere();
    break;
  }
}
4007
// In-place form: compress the non-null Klass* in r.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}
4011
// Decompress the non-null narrow klass in src into dst; inverse of
// encode_klass_not_null for each KlassDecodeMode.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  switch (klass_decode_mode()) {
  case KlassDecodeZero:
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    break;

  case KlassDecodeXor:
    // Shift first, then xor the base back in (base bits are disjoint).
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    break;

  case KlassDecodeMovk: {
    const uint64_t shifted_base =
      (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();

    // Patch bits [47:32] with the shifted base, then shift the whole
    // value into place.
    if (dst != src) movw(dst, src);
    movk(dst, shifted_base >> 32, 32);

    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, dst, LogKlassAlignmentInBytes);
    }

    break;
  }

  case KlassDecodeNone:
    ShouldNotReachHere();
    break;
  }
}
4052
// In-place form: decompress the non-null narrow klass in r.
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
4056
// Emit a patchable load of the narrow (compressed) form of 'obj' into dst.
// The 0xDEAD/0xBEEF placeholder is rewritten later by the relocation
// machinery with the real narrow oop value.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}
4074
// Emit a (relocatable) materialization of the narrow form of Klass* k into
// dst, as a movz/movk pair carrying the encoded 32-bit value.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}
4088
access_load_at(BasicType type,DecoratorSet decorators,Register dst,Address src,Register tmp1,Register thread_tmp)4089 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4090 Register dst, Address src,
4091 Register tmp1, Register thread_tmp) {
4092 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4093 decorators = AccessInternal::decorator_fixup(decorators);
4094 bool as_raw = (decorators & AS_RAW) != 0;
4095 if (as_raw) {
4096 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4097 } else {
4098 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4099 }
4100 }
4101
access_store_at(BasicType type,DecoratorSet decorators,Address dst,Register src,Register tmp1,Register thread_tmp)4102 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4103 Address dst, Register src,
4104 Register tmp1, Register thread_tmp) {
4105 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4106 decorators = AccessInternal::decorator_fixup(decorators);
4107 bool as_raw = (decorators & AS_RAW) != 0;
4108 if (as_raw) {
4109 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4110 } else {
4111 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4112 }
4113 }
4114
resolve(DecoratorSet decorators,Register obj)4115 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4116 // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4117 if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4118 decorators |= ACCESS_READ | ACCESS_WRITE;
4119 }
4120 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4121 return bs->resolve(this, decorators, obj);
4122 }
4123
// Load a (possibly null) heap oop from src into dst with IN_HEAP barriers.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4128
// Load a heap oop known to be non-null; IS_NOT_NULL lets barriers skip
// their null checks.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}
4133
// Store the oop in src to the heap location dst with IN_HEAP barriers.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4138
// Used for storing NULLs.
// noreg as the source tells the barrier assembler to store a null oop.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}
4143
// Record 'obj' in the oop recorder and return its address wrapped with a
// metadata relocation, suitable for use as a patchable operand.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}
4150
// Move an oop into a register.  immediate is true if we want
// immediate instructions and nmethod entry barriers are not enabled.
// i.e. we are not going to patch this instruction while the code is being
// executed by another thread.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);

  // nmethod entry barrier necessitate using the constant pool. They have to be
  // ordered with respected to oop accesses.
  // Using immediate literals would necessitate ISBs.
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL || !immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));

}
4180
// Move a metadata address into a register.
// The address is emitted with a metadata relocation so it can be patched.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}
4192
// Return the address of 'obj' wrapped with an oop relocation, for use as a
// patchable constant operand.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}
4204
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Delegates TLAB allocation to the GC's barrier-set assembler; branches to
// slow_case when the TLAB cannot satisfy the request.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}
4215
// Defines obj, preserves var_size_in_bytes
// Delegates direct eden allocation to the GC's barrier-set assembler;
// branches to slow_case on failure.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}
4225
// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
// Uses a Duff's-device-style computed branch into an 8-way unrolled loop of
// zero stores.
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm:
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = 0;
  //    case 7:
  //        p[-7] = 0;
  //    case 6:
  //        p[-6] = 0;
  //      // ...
  //    case 1:
  //        p[-1] = 0;
  //    case 0:
  //        p += 8;
  //     } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);      // cnt -= unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Jump into the unrolled loop body at the right offset: each skipped
  // store is one 4-byte instruction, hence the LSL 2.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
4287
verify_tlab()4288 void MacroAssembler::verify_tlab() {
4289 #ifdef ASSERT
4290 if (UseTLAB && VerifyOops) {
4291 Label next, ok;
4292
4293 stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4294
4295 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4296 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4297 cmp(rscratch2, rscratch1);
4298 br(Assembler::HS, next);
4299 STOP("assert(top >= start)");
4300 should_not_reach_here();
4301
4302 bind(next);
4303 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4304 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4305 cmp(rscratch2, rscratch1);
4306 br(Assembler::HS, ok);
4307 STOP("assert(top <= end)");
4308 should_not_reach_here();
4309
4310 bind(ok);
4311 ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4312 }
4313 #endif
4314 }
4315
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  // Touch the page; the stored value (the remaining size) doubles as a
  // debugging breadcrumb if a crash dump is examined.
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
4345
// Move the address of the polling page into dest.
// The polling page address is loaded from a per-thread field, so no
// relocation is needed; rtype is accepted for interface symmetry with
// read_polling_page() but is unused here.
void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
  ldr(dest, Address(rthread, Thread::polling_page_offset()));
}
4350
// Move the address of the polling page into r, then read the polling
// page. Returns the address of the emitted polling load (from
// read_polling_page) so the caller can record it for safepoint handling.
address MacroAssembler::fetch_and_read_polling_page(Register r, relocInfo::relocType rtype) {
  get_polling_page(r, rtype);
  return read_polling_page(r, rtype);
}
4357
// Read the polling page.  The address of the polling page must
// already be in r. Emits a single relocated load to zr (value is
// discarded; only the memory access matters) and returns the address
// of that instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  // Attach the relocation to the load so the safepoint machinery can
  // identify this instruction.
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}
4366
adrp(Register reg1,const Address & dest,unsigned long & byte_offset)4367 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4368 relocInfo::relocType rtype = dest.rspec().reloc()->type();
4369 unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4370 unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4371 unsigned long dest_page = (unsigned long)dest.target() >> 12;
4372 long offset_low = dest_page - low_page;
4373 long offset_high = dest_page - high_page;
4374
4375 assert(is_valid_AArch64_address(dest.target()), "bad address");
4376 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4377
4378 InstructionMark im(this);
4379 code_section()->relocate(inst_mark(), dest.rspec());
4380 // 8143067: Ensure that the adrp can reach the dest from anywhere within
4381 // the code cache so that if it is relocated we know it will still reach
4382 if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4383 _adrp(reg1, dest.target());
4384 } else {
4385 unsigned long target = (unsigned long)dest.target();
4386 unsigned long adrp_target
4387 = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4388
4389 _adrp(reg1, (address)adrp_target);
4390 movk(reg1, target >> 32, 32);
4391 }
4392 byte_offset = (unsigned long)dest.target() & 0xfff;
4393 }
4394
load_byte_map_base(Register reg)4395 void MacroAssembler::load_byte_map_base(Register reg) {
4396 CardTable::CardValue* byte_map_base =
4397 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4398
4399 if (is_valid_AArch64_address((address)byte_map_base)) {
4400 // Strictly speaking the byte_map_base isn't an address at all,
4401 // and it might even be negative.
4402 unsigned long offset;
4403 adrp(reg, ExternalAddress((address)byte_map_base), offset);
4404 // We expect offset to be zero with most collectors.
4405 if (offset != 0) {
4406 add(reg, reg, offset);
4407 }
4408 } else {
4409 mov(reg, (uint64_t)byte_map_base);
4410 }
4411 }
4412
build_frame(int framesize)4413 void MacroAssembler::build_frame(int framesize) {
4414 assert(framesize > 0, "framesize must be > 0");
4415 if (framesize < ((1 << 9) + 2 * wordSize)) {
4416 sub(sp, sp, framesize);
4417 stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4418 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4419 } else {
4420 stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4421 if (PreserveFramePointer) mov(rfp, sp);
4422 if (framesize < ((1 << 12) + 2 * wordSize))
4423 sub(sp, sp, framesize - 2 * wordSize);
4424 else {
4425 mov(rscratch1, framesize - 2 * wordSize);
4426 sub(sp, sp, rscratch1);
4427 }
4428 }
4429 }
4430
remove_frame(int framesize)4431 void MacroAssembler::remove_frame(int framesize) {
4432 assert(framesize > 0, "framesize must be > 0");
4433 if (framesize < ((1 << 9) + 2 * wordSize)) {
4434 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4435 add(sp, sp, framesize);
4436 } else {
4437 if (framesize < ((1 << 12) + 2 * wordSize))
4438 add(sp, sp, framesize - 2 * wordSize);
4439 else {
4440 mov(rscratch1, framesize - 2 * wordSize);
4441 add(sp, sp, rscratch1);
4442 }
4443 ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4444 }
4445 }
4446
4447
// This method checks if provided byte array contains byte with highest bit set.
// ary1: array base; len: element count (word-clobbered); result: set to the
// outcome. Returns pc() on success, or NULL if a required trampoline call
// could not be emitted (caller must bail out of compilation).
address MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG); // size > 32 then go to stub

  // Detect whether reading 4 words from ary1 would cross a page boundary;
  // if so, delegate to the stub, which handles the tail safely.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
  br(CS, STUB); // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  // Test one word (8 bytes) at a time against the sign-bit mask.
  BIND(LOOP);
  ldr(rscratch1, Address(post(ary1, wordSize)));
  tst(rscratch1, UPPER_BIT_MASK);
  br(NE, SET_RESULT);
  subs(len, len, wordSize);
  br(GE, LOOP);
  cmpw(len, -wordSize);
  br(EQ, SET_RESULT);

  // Fewer than 8 bytes remain: load the next word and shift away the bytes
  // that are beyond the array before testing.
  BIND(END);
  ldr(result, Address(ary1));
  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
  lslv(result, result, len);
  tst(result, UPPER_BIT_MASK);
  b(SET_RESULT);

  BIND(STUB);
  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
  address tpc1 = trampoline_call(has_neg);
  if (tpc1 == NULL) {
    // Trampoline allocation failed: unbind remaining labels and abort.
    DEBUG_ONLY(reset_labels3(STUB_LONG, SET_RESULT, DONE));
    postcond(pc() == badAddress);
    return NULL;
  }
  b(DONE);

  BIND(STUB_LONG);
  RuntimeAddress has_neg_long = RuntimeAddress(StubRoutines::aarch64::has_negatives_long());
  assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
  address tpc2 = trampoline_call(has_neg_long);
  if (tpc2 == NULL) {
    DEBUG_ONLY(reset_labels2(SET_RESULT, DONE));
    postcond(pc() == badAddress);
    return NULL;
  }
  b(DONE);

  BIND(SET_RESULT);
  cset(result, NE); // set true or false

  BIND(DONE);
  postcond(pc() != badAddress);
  return pc();
}
4514
arrays_equals(Register a1,Register a2,Register tmp3,Register tmp4,Register tmp5,Register result,Register cnt1,int elem_size)4515 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
4516 Register tmp4, Register tmp5, Register result,
4517 Register cnt1, int elem_size) {
4518 Label DONE, SAME;
4519 Register tmp1 = rscratch1;
4520 Register tmp2 = rscratch2;
4521 Register cnt2 = tmp2; // cnt2 only used in array length compare
4522 int elem_per_word = wordSize/elem_size;
4523 int log_elem_size = exact_log2(elem_size);
4524 int length_offset = arrayOopDesc::length_offset_in_bytes();
4525 int base_offset
4526 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4527 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
4528
4529 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4530 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4531
4532 #ifndef PRODUCT
4533 {
4534 const char kind = (elem_size == 2) ? 'U' : 'L';
4535 char comment[64];
4536 snprintf(comment, sizeof comment, "array_equals%c{", kind);
4537 BLOCK_COMMENT(comment);
4538 }
4539 #endif
4540
4541 // if (a1 == a2)
4542 // return true;
4543 cmpoop(a1, a2); // May have read barriers for a1 and a2.
4544 br(EQ, SAME);
4545
4546 if (UseSimpleArrayEquals) {
4547 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
4548 // if (a1 == null || a2 == null)
4549 // return false;
4550 // a1 & a2 == 0 means (some-pointer is null) or
4551 // (very-rare-or-even-probably-impossible-pointer-values)
4552 // so, we can save one branch in most cases
4553 tst(a1, a2);
4554 mov(result, false);
4555 br(EQ, A_MIGHT_BE_NULL);
4556 // if (a1.length != a2.length)
4557 // return false;
4558 bind(A_IS_NOT_NULL);
4559 ldrw(cnt1, Address(a1, length_offset));
4560 ldrw(cnt2, Address(a2, length_offset));
4561 eorw(tmp5, cnt1, cnt2);
4562 cbnzw(tmp5, DONE);
4563 lea(a1, Address(a1, base_offset));
4564 lea(a2, Address(a2, base_offset));
4565 // Check for short strings, i.e. smaller than wordSize.
4566 subs(cnt1, cnt1, elem_per_word);
4567 br(Assembler::LT, SHORT);
4568 // Main 8 byte comparison loop.
4569 bind(NEXT_WORD); {
4570 ldr(tmp1, Address(post(a1, wordSize)));
4571 ldr(tmp2, Address(post(a2, wordSize)));
4572 subs(cnt1, cnt1, elem_per_word);
4573 eor(tmp5, tmp1, tmp2);
4574 cbnz(tmp5, DONE);
4575 } br(GT, NEXT_WORD);
4576 // Last longword. In the case where length == 4 we compare the
4577 // same longword twice, but that's still faster than another
4578 // conditional branch.
4579 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4580 // length == 4.
4581 if (log_elem_size > 0)
4582 lsl(cnt1, cnt1, log_elem_size);
4583 ldr(tmp3, Address(a1, cnt1));
4584 ldr(tmp4, Address(a2, cnt1));
4585 eor(tmp5, tmp3, tmp4);
4586 cbnz(tmp5, DONE);
4587 b(SAME);
4588 bind(A_MIGHT_BE_NULL);
4589 // in case both a1 and a2 are not-null, proceed with loads
4590 cbz(a1, DONE);
4591 cbz(a2, DONE);
4592 b(A_IS_NOT_NULL);
4593 bind(SHORT);
4594
4595 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
4596 {
4597 ldrw(tmp1, Address(post(a1, 4)));
4598 ldrw(tmp2, Address(post(a2, 4)));
4599 eorw(tmp5, tmp1, tmp2);
4600 cbnzw(tmp5, DONE);
4601 }
4602 bind(TAIL03);
4603 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
4604 {
4605 ldrh(tmp3, Address(post(a1, 2)));
4606 ldrh(tmp4, Address(post(a2, 2)));
4607 eorw(tmp5, tmp3, tmp4);
4608 cbnzw(tmp5, DONE);
4609 }
4610 bind(TAIL01);
4611 if (elem_size == 1) { // Only needed when comparing byte arrays.
4612 tbz(cnt1, 0, SAME); // 0-1 bytes left.
4613 {
4614 ldrb(tmp1, a1);
4615 ldrb(tmp2, a2);
4616 eorw(tmp5, tmp1, tmp2);
4617 cbnzw(tmp5, DONE);
4618 }
4619 }
4620 } else {
4621 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
4622 CSET_EQ, LAST_CHECK;
4623 mov(result, false);
4624 cbz(a1, DONE);
4625 ldrw(cnt1, Address(a1, length_offset));
4626 cbz(a2, DONE);
4627 ldrw(cnt2, Address(a2, length_offset));
4628 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
4629 // faster to perform another branch before comparing a1 and a2
4630 cmp(cnt1, (u1)elem_per_word);
4631 br(LE, SHORT); // short or same
4632 ldr(tmp3, Address(pre(a1, base_offset)));
4633 subs(zr, cnt1, stubBytesThreshold);
4634 br(GE, STUB);
4635 ldr(tmp4, Address(pre(a2, base_offset)));
4636 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4637 cmp(cnt2, cnt1);
4638 br(NE, DONE);
4639
4640 // Main 16 byte comparison loop with 2 exits
4641 bind(NEXT_DWORD); {
4642 ldr(tmp1, Address(pre(a1, wordSize)));
4643 ldr(tmp2, Address(pre(a2, wordSize)));
4644 subs(cnt1, cnt1, 2 * elem_per_word);
4645 br(LE, TAIL);
4646 eor(tmp4, tmp3, tmp4);
4647 cbnz(tmp4, DONE);
4648 ldr(tmp3, Address(pre(a1, wordSize)));
4649 ldr(tmp4, Address(pre(a2, wordSize)));
4650 cmp(cnt1, (u1)elem_per_word);
4651 br(LE, TAIL2);
4652 cmp(tmp1, tmp2);
4653 } br(EQ, NEXT_DWORD);
4654 b(DONE);
4655
4656 bind(TAIL);
4657 eor(tmp4, tmp3, tmp4);
4658 eor(tmp2, tmp1, tmp2);
4659 lslv(tmp2, tmp2, tmp5);
4660 orr(tmp5, tmp4, tmp2);
4661 cmp(tmp5, zr);
4662 b(CSET_EQ);
4663
4664 bind(TAIL2);
4665 eor(tmp2, tmp1, tmp2);
4666 cbnz(tmp2, DONE);
4667 b(LAST_CHECK);
4668
4669 bind(STUB);
4670 ldr(tmp4, Address(pre(a2, base_offset)));
4671 cmp(cnt2, cnt1);
4672 br(NE, DONE);
4673 if (elem_size == 2) { // convert to byte counter
4674 lsl(cnt1, cnt1, 1);
4675 }
4676 eor(tmp5, tmp3, tmp4);
4677 cbnz(tmp5, DONE);
4678 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
4679 assert(stub.target() != NULL, "array_equals_long stub has not been generated");
4680 address tpc = trampoline_call(stub);
4681 if (tpc == NULL) {
4682 DEBUG_ONLY(reset_labels5(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
4683 postcond(pc() == badAddress);
4684 return NULL;
4685 }
4686 b(DONE);
4687
4688 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
4689 // so, if a2 == null => return false(0), else return true, so we can return a2
4690 mov(result, a2);
4691 b(DONE);
4692 bind(SHORT);
4693 cmp(cnt2, cnt1);
4694 br(NE, DONE);
4695 cbz(cnt1, SAME);
4696 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4697 ldr(tmp3, Address(a1, base_offset));
4698 ldr(tmp4, Address(a2, base_offset));
4699 bind(LAST_CHECK);
4700 eor(tmp4, tmp3, tmp4);
4701 lslv(tmp5, tmp4, tmp5);
4702 cmp(tmp5, zr);
4703 bind(CSET_EQ);
4704 cset(result, EQ);
4705 b(DONE);
4706 }
4707
4708 bind(SAME);
4709 mov(result, true);
4710 // That's it.
4711 bind(DONE);
4712
4713 BLOCK_COMMENT("} array_equals");
4714 postcond(pc() != badAddress);
4715 return pc();
4716 }
4717
// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.  Strings are equal when the contents
// match; result is set to true/false.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  // cnt1 is now <= 0, so a1+cnt1 re-reads the final (overlapping) word.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Tail handling: compare 4, then 2, then (byte strings only) 1 byte,
  // driven by the bits of the remaining length.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}
4808
4809
// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
// Must be a power of two (asserted in zero_words).
const int MacroAssembler::zero_words_block_size = 8;
4813
// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
// Returns pc() on success, or NULL if a trampoline to the zero_blocks
// stub could not be emitted.
address MacroAssembler::zero_words(Register ptr, Register cnt)
{
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, (u1)zero_words_block_size);
  Label around;
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      address tpc = trampoline_call(zero_blocks);
      if (tpc == NULL) {
        DEBUG_ONLY(reset_labels1(around));
        postcond(pc() == badAddress);
        return NULL;
      }
    } else {
      // Stubs not finished yet (we are generating them): direct branch.
      bl(zero_blocks);
    }
  }
  bind(around);
  // Fewer than zero_words_block_size words remain; zero them by testing
  // each bit of cnt and emitting the corresponding number of stp/str.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
  postcond(pc() != badAddress);
  return pc();
}
4865
// base:  Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:   Immediate count in HeapWords.
// Since cnt is a compile-time constant, the store sequence is fully
// unrolled for small counts and partially unrolled into a loop for
// larger ones. Clobbers rscratch1/rscratch2 on the loop path.
#define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small count: fully unrolled paired stores.
    for (; i < (int)cnt; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Emit the remainder inline so the loop count is a multiple of the
    // unroll factor.
    for (; i < remainder; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++) {
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    }
    // The last pair pre-increments loop_base for the next iteration.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}
4902
// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
// Clobbers rscratch1/rscratch2.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Computed branch into the stp table below: skip the stores not needed
  // for this alignment (each stp is one 4-byte instruction, and each
  // zeroes 16 bytes, hence tmp >> 2).
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Main loop: one DC ZVA per zva_length-byte block while at least one
  // full block remains (cnt is in words, hence >> 3).
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}
4946
// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
// Clobbers cnt, rscratch1, rscratch2.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Duff's-device-style dispatch into an unrolled block of paired stores.
//
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//     } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is not 16-byte aligned, emit one single str first so the stp
  // pairs below operate on 16-byte-aligned addresses.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Jump into the unrolled stp block; each stp is 4 bytes and covers two
  // words, hence the LSL 1 on the word remainder.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // At most one odd word may remain.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
5004
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Narrows len chars at src into bytes at dst, stopping at the first char
// with a nonzero high byte. result is set to the number of chars actually
// encoded (== initial len when all chars fit in ISO-8859-1).
// Clobbers rscratch1/rscratch2 and the four vector temporaries.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, (u1)8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, (u1)32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Prefetching variant: two loop bodies, with and without prfm,
        // selected by how far len is from the prefetch distance.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          // OR the input pairs, then uzp2 extracts all high bytes: any
          // nonzero high byte means a non-Latin-1 char in this chunk.
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, (u1)32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, (u1)32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      cmp(len, (u1)8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      // 8-char SIMD step: uzp1 gathers low bytes, uzp2 the high bytes.
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, (u1)8);
      br(GE, NEXT_8);

    BIND(LOOP_1);

    cbz(len, DONE);
    BIND(NEXT_1);
      // Scalar cleanup: one char at a time until a non-Latin-1 char or
      // len reaches 0.
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}
5109
5110
5111 // Inflate byte[] array to char[].
byte_array_inflate(Register src,Register dst,Register len,FloatRegister vtmp1,FloatRegister vtmp2,FloatRegister vtmp3,Register tmp4)5112 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5113 FloatRegister vtmp1, FloatRegister vtmp2,
5114 FloatRegister vtmp3, Register tmp4) {
5115 Label big, done, after_init, to_stub;
5116
5117 assert_different_registers(src, dst, len, tmp4, rscratch1);
5118
5119 fmovd(vtmp1, zr);
5120 lsrw(tmp4, len, 3);
5121 bind(after_init);
5122 cbnzw(tmp4, big);
5123 // Short string: less than 8 bytes.
5124 {
5125 Label loop, tiny;
5126
5127 cmpw(len, 4);
5128 br(LT, tiny);
5129 // Use SIMD to do 4 bytes.
5130 ldrs(vtmp2, post(src, 4));
5131 zip1(vtmp3, T8B, vtmp2, vtmp1);
5132 subw(len, len, 4);
5133 strd(vtmp3, post(dst, 8));
5134
5135 cbzw(len, done);
5136
5137 // Do the remaining bytes by steam.
5138 bind(loop);
5139 ldrb(tmp4, post(src, 1));
5140 strh(tmp4, post(dst, 2));
5141 subw(len, len, 1);
5142
5143 bind(tiny);
5144 cbnz(len, loop);
5145
5146 b(done);
5147 }
5148
5149 if (SoftwarePrefetchHintDistance >= 0) {
5150 bind(to_stub);
5151 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5152 assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5153 address tpc = trampoline_call(stub);
5154 if (tpc == NULL) {
5155 DEBUG_ONLY(reset_labels2(big, done));
5156 postcond(pc() == badAddress);
5157 return NULL;
5158 }
5159 b(after_init);
5160 }
5161
5162 // Unpack the bytes 8 at a time.
5163 bind(big);
5164 {
5165 Label loop, around, loop_last, loop_start;
5166
5167 if (SoftwarePrefetchHintDistance >= 0) {
5168 const int large_loop_threshold = (64 + 16)/8;
5169 ldrd(vtmp2, post(src, 8));
5170 andw(len, len, 7);
5171 cmp(tmp4, (u1)large_loop_threshold);
5172 br(GE, to_stub);
5173 b(loop_start);
5174
5175 bind(loop);
5176 ldrd(vtmp2, post(src, 8));
5177 bind(loop_start);
5178 subs(tmp4, tmp4, 1);
5179 br(EQ, loop_last);
5180 zip1(vtmp2, T16B, vtmp2, vtmp1);
5181 ldrd(vtmp3, post(src, 8));
5182 st1(vtmp2, T8H, post(dst, 16));
5183 subs(tmp4, tmp4, 1);
5184 zip1(vtmp3, T16B, vtmp3, vtmp1);
5185 st1(vtmp3, T8H, post(dst, 16));
5186 br(NE, loop);
5187 b(around);
5188 bind(loop_last);
5189 zip1(vtmp2, T16B, vtmp2, vtmp1);
5190 st1(vtmp2, T8H, post(dst, 16));
5191 bind(around);
5192 cbz(len, done);
5193 } else {
5194 andw(len, len, 7);
5195 bind(loop);
5196 ldrd(vtmp2, post(src, 8));
5197 sub(tmp4, tmp4, 1);
5198 zip1(vtmp3, T16B, vtmp2, vtmp1);
5199 st1(vtmp3, T8H, post(dst, 16));
5200 cbnz(tmp4, loop);
5201 }
5202 }
5203
5204 // Do the tail of up to 8 bytes.
5205 add(src, src, len);
5206 ldrd(vtmp3, Address(src, -8));
5207 add(dst, dst, len, ext::uxtw, 1);
5208 zip1(vtmp3, T16B, vtmp3, vtmp1);
5209 strq(vtmp3, Address(dst, -16));
5210
5211 bind(done);
5212 postcond(pc() != badAddress);
5213 return pc();
5214 }
5215
// Compress char[] array to byte[].
//
// Emits code that attempts to compress 'len' UTF-16 chars at 'src' into
// single bytes at 'dst'. On success 'result' holds the original length;
// if any char does not fit in a byte, 'result' is set to zero so the
// caller can fall back to the uncompressed representation.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  // Delegate the copy/narrowing loop to the ISO-8859-1 encoder.
  // NOTE(review): encode_iso_array is defined elsewhere in this file; the
  // csel below relies on it leaving len == 0 iff every char was encoded,
  // with the processed count in 'result' — confirm against its definition.
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // result = (len == 0) ? result : 0 — i.e. report failure as 0.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}
5226
// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
// Emits code that loads the current JavaThread* into 'dst', preserving
// every other register the helper call might clobber.
void MacroAssembler::get_thread(Register dst) {
  // Save r0..r1 (the helper's documented clobbers) plus lr (clobbered by
  // the blr below), but exclude dst — it is about to be overwritten anyway.
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blr(lr);
  if (dst != c_rarg0) {
    // Helper returns in c_rarg0 (r0); copy into the requested register.
    mov(dst, c_rarg0);
  }

  // Restore the saved context; dst keeps the thread pointer because it
  // was excluded from saved_regs above.
  pop(saved_regs, sp);
}
5246
cache_wb(Address line)5247 void MacroAssembler::cache_wb(Address line) {
5248 assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5249 assert(line.index() == noreg, "index should be noreg");
5250 assert(line.offset() == 0, "offset should be 0");
5251 // would like to assert this
5252 // assert(line._ext.shift == 0, "shift should be zero");
5253 if (VM_Version::supports_dcpop()) {
5254 // writeback using clear virtual address to point of persistence
5255 dc(Assembler::CVAP, line.base());
5256 } else {
5257 // no need to generate anything as Unsafe.writebackMemory should
5258 // never invoke this stub
5259 }
5260 }
5261
cache_wbsync(bool is_pre)5262 void MacroAssembler::cache_wbsync(bool is_pre) {
5263 // we only need a barrier post sync
5264 if (!is_pre) {
5265 membar(Assembler::AnyAny);
5266 }
5267 }
5268