1 /*
2 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include <sys/types.h>
27
28 #include "precompiled.hpp"
29 #include "jvm.h"
30 #include "asm/assembler.hpp"
31 #include "asm/assembler.inline.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/barrierSetAssembler.hpp"
35 #include "gc/shared/cardTableBarrierSet.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "compiler/disassembler.hpp"
38 #include "memory/resourceArea.hpp"
39 #include "nativeInst_aarch64.hpp"
40 #include "oops/accessDecorators.hpp"
41 #include "oops/compressedOops.inline.hpp"
42 #include "oops/klass.inline.hpp"
43 #include "runtime/biasedLocking.hpp"
44 #include "runtime/icache.hpp"
45 #include "runtime/interfaceSupport.inline.hpp"
46 #include "runtime/jniHandles.inline.hpp"
47 #include "runtime/sharedRuntime.hpp"
48 #include "runtime/thread.hpp"
49 #ifdef COMPILER1
50 #include "c1/c1_LIRAssembler.hpp"
51 #endif
52 #ifdef COMPILER2
53 #include "oops/oop.hpp"
54 #include "opto/compile.hpp"
55 #include "opto/intrinsicnode.hpp"
56 #include "opto/node.hpp"
57 #endif
58
59 #ifdef PRODUCT
60 #define BLOCK_COMMENT(str) /* nothing */
61 #define STOP(error) stop(error)
62 #else
63 #define BLOCK_COMMENT(str) block_comment(str)
64 #define STOP(error) block_comment(error); stop(error)
65 #endif
66
67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
68
69 // Patch any kind of instruction; there may be several instructions.
70 // Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ull << 48), "48-bit overflow in address constant");
  // Branch-type immediates are encoded in units of 4-byte instruction words.
  intptr_t offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // ADRP form: the offset is a page (4K) delta, with the low 12 bits
      // of the target carried by a follow-on instruction.
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate); the immediate is
        // scaled by the access size, so the target must be aligned.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32: bits 47:32 of the target go into the movk;
        // the adrp then only needs to cover the low 32 bits, so the page
        // delta is recomputed against a destination whose high bits are
        // taken from the current PC.
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        uintptr_t dest = ((uintptr_t)target & 0xffffffffULL) | ((uintptr_t)branch & 0xffff00000000ULL);
        uintptr_t pc_page = (uintptr_t)branch >> 12;
        uintptr_t adr_page = (uintptr_t)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // ADR/ADRP split the immediate: low 2 bits in [30:29], the rest in [23:5].
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits per instruction.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
167
// Patch an oop constant embedded in the instruction stream at insn_addr
// to refer to the oop `o`.  Returns the number of bytes patched.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP: first instruction carries the upper half of the
    // compressed oop, the following movk the lower half.
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP: movz + movk + movk, 16 bits of the address each.
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
193
// Patch a narrow-klass constant embedded at insn_addr to the value `n`.
// Returns the number of bytes patched.
int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // First instruction carries bits 31:16 of n, the movk bits 15:0.
  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}
206
// Decode the instruction (sequence) at insn_addr and return the address
// it refers to.  This is the inverse of pd_patch_instruction_size and
// recognizes the same instruction forms.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  intptr_t offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: immlo in bits [30:29], immhi in [23:5].
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      // ADRP: the immediate is a page delta; mask down to the page base.
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases  we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate); immediate is scaled
        // by the access size.
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk #imm16<<32: bits 47:32 of the target come from the movk.
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling-page load: no embedded target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
290
// Emit a full-system data synchronization barrier.  The thread and tmp
// registers are unused on AArch64; the parameters exist to match the
// shared MacroAssembler interface.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}
294
// Emit a safepoint poll: branch to slow_path when a safepoint is
// pending.  With thread-local polling, test the poll bit in the
// per-thread poll word; otherwise load and test the global
// SafepointSynchronize state.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    // The cbnz below relies on _not_synchronized being encoded as zero.
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}
307
308 // Just like safepoint_poll, but use an acquiring load for thread-
309 // local polling.
310 //
311 // We need an acquire here to ensure that any subsequent load of the
312 // global SafepointSynchronize::_state flag is ordered after this load
313 // of the local Thread::_polling page. We don't want this poll to
314 // return false (i.e. not safepointing) and a later poll of the global
315 // SafepointSynchronize::_state spuriously to return true.
316 //
317 // This is to avoid a race when we're in a native->Java transition
318 // racing the code which wakes up from a safepoint.
319 //
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Use a load-acquire (ldar) of the thread-local poll word; the
    // ordering rationale is in the comment above this function.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global-state polling needs no acquire; fall back to the plain poll.
    safepoint_poll(slow_path);
  }
}
329
// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc)
// after returning from C land, so stack walkers no longer see a
// transition frame.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}
343
344 // Calls to C land
345 //
346 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
347 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
348 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optional fp, optional pc) in the
// thread's frame anchor before a call into C land.  `scratch` is used
// only when last_java_sp == sp, since sp cannot be stored directly.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  // last_java_pc is optional
  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    // sp cannot be the source of a str; copy it through scratch first.
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}
375
// Variant taking the last Java pc as a code address: materialize it
// with adr into `scratch`, store it in the frame anchor, then delegate
// for sp/fp (passing noreg so the pc is not stored twice).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}
389
// Variant taking the last Java pc as a Label.  If the label is already
// bound its address is used directly; otherwise the current pc is
// emitted as a placeholder and a patch entry is registered so the real
// address is filled in once the label is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}
402
// Call a destination inside the code cache that may be out of range of
// a direct bl.  When far branches are needed the target address is
// materialized with adrp/add into `tmp` and called via blr; otherwise a
// direct bl is emitted.  If `cbuf` is provided its instruction mark is
// set on the branch instruction itself.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    uintptr_t offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}
420
far_jump(Address entry,CodeBuffer * cbuf,Register tmp)421 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
422 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
423 assert(CodeCache::find_blob(entry.target()) != NULL,
424 "destination of far call not found in code cache");
425 if (far_branches()) {
426 uintptr_t offset;
427 // We can use ADRP here because we know that the total size of
428 // the code cache cannot exceed 2Gb.
429 adrp(tmp, entry, offset);
430 add(tmp, tmp, offset);
431 if (cbuf) cbuf->set_insts_mark();
432 br(tmp);
433 } else {
434 if (cbuf) cbuf->set_insts_mark();
435 b(entry);
436 }
437 }
438
// Check whether sp has dropped below the thread's reserved stack
// activation; if so, call into the runtime to enable the reserved zone
// and then tail-jump to the delayed StackOverflowError thrower.
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}
462
// Emit the biased-locking fast path.  On success control reaches
// `done` with the lock acquired (or falls through to `cas_label` for
// the normal CAS-based locking).  On contention/revocation, control
// goes to `slow_case` when provided.  Returns the code offset of the
// mark-word load that may take a null check, or -1 if the caller
// supplied the mark in swap_reg.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  // Ignore the age bits when comparing; a zero result means the bias
  // is owned by us and the epoch is current.
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}
623
// Emit the biased-locking unlock check: if the object's mark still has
// the biased pattern, unlocking is a no-op and control branches to
// `done`; otherwise it falls through to the regular unlock path.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}
638
pass_arg0(MacroAssembler * masm,Register arg)639 static void pass_arg0(MacroAssembler* masm, Register arg) {
640 if (c_rarg0 != arg ) {
641 masm->mov(c_rarg0, arg);
642 }
643 }
644
pass_arg1(MacroAssembler * masm,Register arg)645 static void pass_arg1(MacroAssembler* masm, Register arg) {
646 if (c_rarg1 != arg ) {
647 masm->mov(c_rarg1, arg);
648 }
649 }
650
pass_arg2(MacroAssembler * masm,Register arg)651 static void pass_arg2(MacroAssembler* masm, Register arg) {
652 if (c_rarg2 != arg ) {
653 masm->mov(c_rarg2, arg);
654 }
655 }
656
pass_arg3(MacroAssembler * masm,Register arg)657 static void pass_arg3(MacroAssembler* masm, Register arg) {
658 if (c_rarg3 != arg ) {
659 masm->mov(c_rarg3, arg);
660 }
661 }
662
// Common code for calling into the VM runtime: sets the last Java
// frame, passes the current thread as the first C argument, makes the
// leaf call, restores lr, resets the frame anchor, optionally forwards
// pending exceptions, and fetches the oop result if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // lr could be poisoned with PAC signature during throw_pending_exception
  // if it was tail-call optimized by compiler, since lr is not callee-saved
  // reload it with proper value
  adr(lr, l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    // Tail-jump to the common exception forwarder.
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
731
// Convenience wrapper: forwards to call_VM_base with no explicit
// java_thread or last_java_sp, letting them default (rthread / esp).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}
735
736 // Maybe emit a call via a trampoline. If the code cache is small
737 // trampolines won't be emitted.
738
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        postcond(pc() == badAddress);
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Branch-to-self placeholder; presumably redirected to the
    // trampoline stub when the call is resolved — see
    // emit_trampoline_stub.
    bl(pc());
  }
  // just need to return a non-null address
  postcond(pc() != badAddress);
  return pc();
}
777
778
779 // Emit a trampoline stub for a call to a target which is too far away.
780 //
781 // code sequences:
782 //
783 // call-site:
784 // branch-and-link to <destination> or <trampoline stub>
785 //
786 // Related trampoline stub for this call site in the stub section:
787 // load the call target from the constant pool
788 // branch (LR still points to the call site above)
789
// Emit a trampoline stub into the stub section for the call instruction
// at insts_call_instruction_offset. Layout (relied on by
// NativeCallTrampolineStub): an rscratch1-load of a 64-bit destination
// word that immediately follows the br. Returns the stub's start
// address, or NULL if the CodeBuffer could not be expanded.
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);   // pc-relative load of the destination word below
  br(rscratch1);
  bind(target);
  // The destination word must sit exactly at data_offset from the stub start.
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
825
// Emit the stub used to redirect a static call to the interpreter.
// The instruction layout here is load-bearing: do not reorder or
// change instruction selection without updating
// CompiledDirectStaticCall::set_to_interpreted.
void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  // Placeholder Method*; presumably patched when the call is resolved
  // (see set_to_interpreted) — TODO confirm.
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);   // placeholder address, patched with the real entry
  br(rscratch1);
}
837
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  // since C-style booleans are stored in one byte
  // only! (was bug)
  tst(x, 0xff);               // test only the low byte
  cset(x, Assembler::NE);     // x = (low byte != 0) ? 1 : 0
}
846
// Emit an inline-cache call to entry, tagging the site with a
// virtual_call relocation carrying method_index. rscratch2 is preloaded
// with Universe::non_oop_word() — presumably the initial inline-cache
// sentinel value expected by the IC machinery (confirm against the IC
// stub conventions). Returns trampoline_call's result (NULL on
// CodeCache-full).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // uintptr_t offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}
855
856 // Implementation of call_VM versions
857
// call_VM with no register arguments.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}
863
// call_VM with one register argument (marshalled by pass_arg1).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}
871
// call_VM with two register arguments. Arguments are marshalled in
// reverse order so a later pass_arg cannot clobber an earlier one
// (the c_rarg2 assert guards that ordering).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}
882
// call_VM with three register arguments. Arguments are marshalled in
// reverse order so a later pass_arg cannot clobber an earlier one
// (the c_rargN asserts guard that ordering).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}
899
// call_VM variant where the caller supplies last_java_sp explicitly;
// forwarded straight to call_VM_base together with rthread.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
907
// Explicit-last_java_sp call_VM with one register argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
916
// Explicit-last_java_sp call_VM with two register arguments; reverse
// marshalling order with a clobber assert, as in the other overloads.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
929
// Explicit-last_java_sp call_VM with three register arguments; reverse
// marshalling order with clobber asserts, as in the other overloads.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
945
946
// Fetch the oop the VM call left in JavaThread::vm_result, clear the
// slot, and (under -XX:+VerifyOops) sanity-check the value.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}
952
// Fetch the metadata value the VM call left in JavaThread::vm_result_2
// and clear the slot (no oop verification — this is not an oop).
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}
957
align(int modulus)958 void MacroAssembler::align(int modulus) {
959 while (offset() % modulus != 0) nop();
960 }
961
962 // these are no-ops overridden by InterpreterMacroAssembler
963
// Intentionally empty: InterpreterMacroAssembler overrides this.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
965
// Intentionally empty: InterpreterMacroAssembler overrides this.
void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
967
968
delayed_value_impl(intptr_t * delayed_value_addr,Register tmp,int offset)969 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
970 Register tmp,
971 int offset) {
972 intptr_t value = *delayed_value_addr;
973 if (value != 0)
974 return RegisterOrConstant(value + offset);
975
976 // load indirectly to solve generation ordering problem
977 ldr(tmp, ExternalAddress((address) delayed_value_addr));
978
979 if (offset != 0)
980 add(tmp, tmp, offset);
981
982 return RegisterOrConstant(tmp);
983 }
984
985 // Look up the method for a megamorphic invokeinterface call.
986 // The target method is determined by <intf_klass, itable_index>.
987 // The receiver klass is in recv_klass.
988 // On success, the result will be in method_result, and execution falls through.
989 // On failure, execution transfers to the given label.
// Look up the method for a megamorphic invokeinterface call by scanning
// recv_klass's itable for intf_klass. On success execution falls
// through with the Method* in method_result (if return_method); on
// failure it branches to L_no_such_interface. recv_klass is clobbered
// when return_method is true.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length*8 + vtable_base = first itable entry
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // Peel the first iteration: check the first entry before the loop.
  ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
  cmp(intf_klass, method_result);
  br(Assembler::EQ, found_method);
  bind(search);
  // Check that the previous entry is non-null.  A null entry means that
  // the receiver class doesn't implement the interface, and wasn't the
  // same as when the caller was compiled.
  cbz(method_result, L_no_such_interface);
  if (itableOffsetEntry::interface_offset_in_bytes() != 0) {
    add(scan_temp, scan_temp, scan_step);
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
  } else {
    // Interface field is at offset 0: fold the advance into a pre-indexed load.
    ldr(method_result, Address(pre(scan_temp, scan_step)));
  }
  cmp(intf_klass, method_result);
  br(Assembler::NE, search);

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // method_result = *(recv_klass + itable_index*8 + entry->offset())
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}
1059
1060 // virtual method calling
// Load the Method* for a virtual call: method_result =
// recv_klass->vtable[vtable_index].method. Handles both a run-time
// index (register) and a compile-time constant index.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    // Dynamic index: address the entry with a scaled register offset.
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    // Constant index: fold everything into one immediate offset
    // (form_address materializes it via rscratch1 if out of range).
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
1080
// Full subtype check (fast path then slow path). Branches to L_success
// if sub_klass is a subtype of super_klass; otherwise falls through
// (L_failure is bound at the end).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
1090
1091
// Fast path of a subtype check: identity test, then a probe of the
// super-display entry at super_check_offset (loaded from super_klass if
// not supplied). Branches to L_success / L_failure / L_slow_path as the
// probe decides; any one of the three may be NULL, meaning "fall
// through". When the offset equals the secondary-super-cache slot a
// miss is inconclusive and the slow path must decide.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at run time: must also test whether it was the
    // secondary-super-cache slot (in which case a miss is inconclusive).
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}
1185
1186 // These two are taken from x86, but they look generally useful
1187
1188 // scans count pointer sized words at [addr] for occurence of value,
1189 // generic
repne_scan(Register addr,Register value,Register count,Register scratch)1190 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1191 Register scratch) {
1192 Label Lloop, Lexit;
1193 cbz(count, Lexit);
1194 bind(Lloop);
1195 ldr(scratch, post(addr, wordSize));
1196 cmp(value, scratch);
1197 br(EQ, Lexit);
1198 sub(count, count, 1);
1199 cbnz(count, Lloop);
1200 bind(Lexit);
1201 }
1202
1203 // scans count 4 byte words at [addr] for occurence of value,
1204 // generic
repne_scanw(Register addr,Register value,Register count,Register scratch)1205 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1206 Register scratch) {
1207 Label Lloop, Lexit;
1208 cbz(count, Lexit);
1209 bind(Lloop);
1210 ldrw(scratch, post(addr, wordSize));
1211 cmpw(value, scratch);
1212 br(EQ, Lexit);
1213 sub(count, count, 1);
1214 cbnz(count, Lloop);
1215 bind(Lexit);
1216 }
1217
// Slow path of a subtype check: linearly scan sub_klass's
// secondary-supers array for super_klass; on a hit, cache it in
// sub_klass's secondary_super_cache slot. Branches to L_failure on a
// miss and to L_success (or falls through) on a hit; at most one of the
// two labels may be NULL. Fixed registers r0/r2/r5 used by the scan are
// spilled around it unless they double as temps.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial-subtype-check statistics counter (debug builds only).
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
1303
1304
// Debug aid: verify that reg holds a valid oop (no-op unless
// -XX:+VerifyOops). Calls the verify_oop stub with the oop in r0 and a
// diagnostic string in rscratch1, preserving all clobbered registers.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the failure message ("verify_oop: <reg>: <s>") and keep it
    // alive in the code area via code_string.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Preserve r0, rscratch1, rscratch2 and lr across the stub call.
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);                              // oop under test
  movptr(rscratch1, (uintptr_t)(address)b);  // diagnostic message

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}
1334
// Debug aid: like verify_oop, but verifies the oop stored at addr
// (no-op unless -XX:+VerifyOops).
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Build the failure message and keep it alive via code_string.
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  // Preserve r0, rscratch1, rscratch2 and lr across the stub call.
  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    // The two stps above moved sp down by 4 words; rebase the load.
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  movptr(rscratch1, (uintptr_t)(address)b);  // diagnostic message

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
1370
argument_address(RegisterOrConstant arg_slot,int extra_slot_offset)1371 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1372 int extra_slot_offset) {
1373 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1374 int stackElementSize = Interpreter::stackElementSize;
1375 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1376 #ifdef ASSERT
1377 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1378 assert(offset1 - offset == stackElementSize, "correct arithmetic");
1379 #endif
1380 if (arg_slot.is_constant()) {
1381 return Address(esp, arg_slot.as_constant() * stackElementSize
1382 + offset);
1383 } else {
1384 add(rscratch1, esp, arg_slot.as_register(),
1385 ext::uxtx, exact_log2(stackElementSize));
1386 return Address(rscratch1, offset);
1387 }
1388 }
1389
call_VM_leaf_base(address entry_point,int number_of_arguments,Label * retaddr)1390 void MacroAssembler::call_VM_leaf_base(address entry_point,
1391 int number_of_arguments,
1392 Label *retaddr) {
1393 Label E, L;
1394
1395 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1396
1397 mov(rscratch1, entry_point);
1398 blr(rscratch1);
1399 if (retaddr)
1400 bind(*retaddr);
1401
1402 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1403 maybe_isb();
1404 }
1405
// Leaf call with arguments already marshalled by the caller.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1409
// Leaf call with one register argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}
1414
// Leaf call with two register arguments (marshalled in order; contrast
// super_call_VM_leaf, which marshals in reverse with clobber asserts).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}
1420
// Leaf call with three register arguments (marshalled in order).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
1428
// Leaf call (one argument) dispatched through MacroAssembler's own
// call_VM_leaf_base, bypassing any subclass override.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1433
// Leaf call (two arguments) via MacroAssembler's own call_VM_leaf_base.
// Arguments are marshalled in reverse order so a later pass_arg cannot
// clobber an earlier one; the assert guards that ordering.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1441
// Leaf call (three arguments) via MacroAssembler's own
// call_VM_leaf_base; reverse marshalling order with clobber asserts.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1451
// Leaf call (four arguments) via MacroAssembler's own
// call_VM_leaf_base; reverse marshalling order with clobber asserts.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1465
null_check(Register reg,int offset)1466 void MacroAssembler::null_check(Register reg, int offset) {
1467 if (needs_explicit_null_check(offset)) {
1468 // provoke OS NULL exception if reg = NULL by
1469 // accessing M[reg] w/o changing any registers
1470 // NOTE: this is plenty to provoke a segv
1471 ldr(zr, Address(reg));
1472 } else {
1473 // nothing to do, (later) access of M[reg + offset]
1474 // will provoke OS NULL exception if reg = NULL
1475 }
1476 }
1477
1478 // MacroAssembler protected routines needed to implement
1479 // public methods
1480
// Materialize the (relocatable) target address of dest into r: record
// dest's relocation at the current pc, then emit the patchable
// movz/movk/movk sequence via movptr.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  uint64_t imm64 = (uint64_t)dest.target();
  movptr(r, imm64);
}
1486
1487 // Move a constant pointer into r. In AArch64 mode the virtual
1488 // address space is 48 bits in size, so we only need three
1489 // instructions to create a patchable instruction sequence that can
1490 // reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    // Annotate the disassembly with the constant being materialized.
    char buffer[64];
    snprintf(buffer, sizeof(buffer), INTPTR_FORMAT, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
  // Always emit the same three-instruction movz/movk/movk sequence
  // (16 bits per instruction, lowest half-word first) so patching code
  // can rely on a fixed layout.
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}
1506
1507 // Macro to mov replicated immediate to vector register.
1508 // Vd will get the following values for different arrangements in T
1509 // imm32 == hex 000000gh T8B: Vd = ghghghghghghghgh
1510 // imm32 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh
1511 // imm32 == hex 0000efgh T4H: Vd = efghefghefghefgh
1512 // imm32 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh
1513 // imm32 == hex abcdefgh T2S: Vd = abcdefghabcdefgh
1514 // imm32 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh
1515 // T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    // Byte lanes: a single movi of the low byte replicates it.
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  uint32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of imm32 and of its complement: whichever
  // needs fewer byte-insertions decides between the movi/orri sequence
  // and the mvni/bici (inverted) sequence.
  uint32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;  // work with the complement
  // Skip zero bytes to find the first byte to materialize.
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  // First instruction sets the whole register (mvni for the inverted
  // form, movi otherwise); subsequent bytes are merged in with
  // bici/orri respectively.
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
1552
mov_immediate64(Register dst,uint64_t imm64)1553 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1554 {
1555 #ifndef PRODUCT
1556 {
1557 char buffer[64];
1558 snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1559 block_comment(buffer);
1560 }
1561 #endif
1562 if (operand_valid_for_logical_immediate(false, imm64)) {
1563 orr(dst, zr, imm64);
1564 } else {
1565 // we can use a combination of MOVZ or MOVN with
1566 // MOVK to build up the constant
1567 uint64_t imm_h[4];
1568 int zero_count = 0;
1569 int neg_count = 0;
1570 int i;
1571 for (i = 0; i < 4; i++) {
1572 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1573 if (imm_h[i] == 0) {
1574 zero_count++;
1575 } else if (imm_h[i] == 0xffffL) {
1576 neg_count++;
1577 }
1578 }
1579 if (zero_count == 4) {
1580 // one MOVZ will do
1581 movz(dst, 0);
1582 } else if (neg_count == 4) {
1583 // one MOVN will do
1584 movn(dst, 0);
1585 } else if (zero_count == 3) {
1586 for (i = 0; i < 4; i++) {
1587 if (imm_h[i] != 0L) {
1588 movz(dst, (uint32_t)imm_h[i], (i << 4));
1589 break;
1590 }
1591 }
1592 } else if (neg_count == 3) {
1593 // one MOVN will do
1594 for (int i = 0; i < 4; i++) {
1595 if (imm_h[i] != 0xffffL) {
1596 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1597 break;
1598 }
1599 }
1600 } else if (zero_count == 2) {
1601 // one MOVZ and one MOVK will do
1602 for (i = 0; i < 3; i++) {
1603 if (imm_h[i] != 0L) {
1604 movz(dst, (uint32_t)imm_h[i], (i << 4));
1605 i++;
1606 break;
1607 }
1608 }
1609 for (;i < 4; i++) {
1610 if (imm_h[i] != 0L) {
1611 movk(dst, (uint32_t)imm_h[i], (i << 4));
1612 }
1613 }
1614 } else if (neg_count == 2) {
1615 // one MOVN and one MOVK will do
1616 for (i = 0; i < 4; i++) {
1617 if (imm_h[i] != 0xffffL) {
1618 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1619 i++;
1620 break;
1621 }
1622 }
1623 for (;i < 4; i++) {
1624 if (imm_h[i] != 0xffffL) {
1625 movk(dst, (uint32_t)imm_h[i], (i << 4));
1626 }
1627 }
1628 } else if (zero_count == 1) {
1629 // one MOVZ and two MOVKs will do
1630 for (i = 0; i < 4; i++) {
1631 if (imm_h[i] != 0L) {
1632 movz(dst, (uint32_t)imm_h[i], (i << 4));
1633 i++;
1634 break;
1635 }
1636 }
1637 for (;i < 4; i++) {
1638 if (imm_h[i] != 0x0L) {
1639 movk(dst, (uint32_t)imm_h[i], (i << 4));
1640 }
1641 }
1642 } else if (neg_count == 1) {
1643 // one MOVN and two MOVKs will do
1644 for (i = 0; i < 4; i++) {
1645 if (imm_h[i] != 0xffffL) {
1646 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1647 i++;
1648 break;
1649 }
1650 }
1651 for (;i < 4; i++) {
1652 if (imm_h[i] != 0xffffL) {
1653 movk(dst, (uint32_t)imm_h[i], (i << 4));
1654 }
1655 }
1656 } else {
1657 // use a MOVZ and 3 MOVKs (makes it easier to debug)
1658 movz(dst, (uint32_t)imm_h[0], 0);
1659 for (i = 1; i < 4; i++) {
1660 movk(dst, (uint32_t)imm_h[i], (i << 4));
1661 }
1662 }
1663 }
1664 }
1665
mov_immediate32(Register dst,uint32_t imm32)1666 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1667 {
1668 #ifndef PRODUCT
1669 {
1670 char buffer[64];
1671 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1672 block_comment(buffer);
1673 }
1674 #endif
1675 if (operand_valid_for_logical_immediate(true, imm32)) {
1676 orrw(dst, zr, imm32);
1677 } else {
1678 // we can use MOVZ, MOVN or two calls to MOVK to build up the
1679 // constant
1680 uint32_t imm_h[2];
1681 imm_h[0] = imm32 & 0xffff;
1682 imm_h[1] = ((imm32 >> 16) & 0xffff);
1683 if (imm_h[0] == 0) {
1684 movzw(dst, imm_h[1], 16);
1685 } else if (imm_h[0] == 0xffff) {
1686 movnw(dst, imm_h[1] ^ 0xffff, 16);
1687 } else if (imm_h[1] == 0) {
1688 movzw(dst, imm_h[0], 0);
1689 } else if (imm_h[1] == 0xffff) {
1690 movnw(dst, imm_h[0] ^ 0xffff, 0);
1691 } else {
1692 // use a MOVZ and MOVK (makes it easier to debug)
1693 movzw(dst, imm_h[0], 0);
1694 movkw(dst, imm_h[1], 16);
1695 }
1696 }
1697 }
1698
// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    // Materialize the full offset and fold it into Rd.
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    uint64_t word_offset = byte_offset >> shift;
    // High 12-bit chunk goes into an ADD immediate; the remainder must
    // fit the load/store's scaled immediate field.
    uint64_t masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
1733
// Atomically increment the 32-bit counter at [counter_addr].
// Uses an LSE LDADD when available, otherwise an LDXR/STXR retry loop.
// Clobbers tmp (new value) and tmp2 (store-exclusive status).
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    // Discard the old value (destination zr).
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}
1751
1752
corrected_idivl(Register result,Register ra,Register rb,bool want_remainder,Register scratch)1753 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1754 bool want_remainder, Register scratch)
1755 {
1756 // Full implementation of Java idiv and irem. The function
1757 // returns the (pc) offset of the div instruction - may be needed
1758 // for implicit exceptions.
1759 //
1760 // constraint : ra/rb =/= scratch
1761 // normal case
1762 //
1763 // input : ra: dividend
1764 // rb: divisor
1765 //
1766 // result: either
1767 // quotient (= ra idiv rb)
1768 // remainder (= ra irem rb)
1769
1770 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1771
1772 int idivl_offset = offset();
1773 if (! want_remainder) {
1774 sdivw(result, ra, rb);
1775 } else {
1776 sdivw(scratch, ra, rb);
1777 Assembler::msubw(result, scratch, rb, ra);
1778 }
1779
1780 return idivl_offset;
1781 }
1782
corrected_idivq(Register result,Register ra,Register rb,bool want_remainder,Register scratch)1783 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1784 bool want_remainder, Register scratch)
1785 {
1786 // Full implementation of Java ldiv and lrem. The function
1787 // returns the (pc) offset of the div instruction - may be needed
1788 // for implicit exceptions.
1789 //
1790 // constraint : ra/rb =/= scratch
1791 // normal case
1792 //
1793 // input : ra: dividend
1794 // rb: divisor
1795 //
1796 // result: either
1797 // quotient (= ra idiv rb)
1798 // remainder (= ra irem rb)
1799
1800 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1801
1802 int idivq_offset = offset();
1803 if (! want_remainder) {
1804 sdiv(result, ra, rb);
1805 } else {
1806 sdiv(scratch, ra, rb);
1807 Assembler::msub(result, scratch, rb, ra);
1808 }
1809
1810 return idivq_offset;
1811 }
1812
// Emit a memory barrier for the given ordering constraint, merging it
// with an immediately preceding barrier when possible.
void MacroAssembler::membar(Membar_mask_bits order_constraint) {
  address prev = pc() - NativeMembar::instruction_size;
  address last = code()->last_insn();
  // Merge only if the last recorded instruction is a membar and it is
  // literally the instruction immediately before the current pc.
  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
    NativeMembar *bar = NativeMembar_at(prev);
    // We are merging two memory barrier instructions.  On AArch64 we
    // can do this simply by ORing them together.
    bar->set_kind(bar->get_kind() | order_constraint);
    BLOCK_COMMENT("merged membar");
  } else {
    // Record this barrier so a following one can merge with it.
    code()->set_last_insn(pc());
    dmb(Assembler::barrier(order_constraint));
  }
}
1827
// Try to merge the current load/store with the previously emitted one
// into a single ldp/stp.  Returns true (and clears the last-insn
// record) when the merge happened.  Otherwise, if the addressing mode
// is mergeable (aligned base_plus_offset), remember this instruction
// as a merge candidate for the next call.
bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
    merge_ldst(rt, adr, size_in_bytes, is_store);
    code()->clear_last_insn();
    return true;
  } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
    const unsigned mask = size_in_bytes - 1;
    if (adr.getMode() == Address::base_plus_offset &&
        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
      code()->set_last_insn(pc());
    }
    return false;
  }
}
1843
ldr(Register Rx,const Address & adr)1844 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1845 // We always try to merge two adjacent loads into one ldp.
1846 if (!try_merge_ldst(Rx, adr, 8, false)) {
1847 Assembler::ldr(Rx, adr);
1848 }
1849 }
1850
ldrw(Register Rw,const Address & adr)1851 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1852 // We always try to merge two adjacent loads into one ldp.
1853 if (!try_merge_ldst(Rw, adr, 4, false)) {
1854 Assembler::ldrw(Rw, adr);
1855 }
1856 }
1857
str(Register Rx,const Address & adr)1858 void MacroAssembler::str(Register Rx, const Address &adr) {
1859 // We always try to merge two adjacent stores into one stp.
1860 if (!try_merge_ldst(Rx, adr, 8, true)) {
1861 Assembler::str(Rx, adr);
1862 }
1863 }
1864
strw(Register Rw,const Address & adr)1865 void MacroAssembler::strw(Register Rw, const Address &adr) {
1866 // We always try to merge two adjacent stores into one stp.
1867 if (!try_merge_ldst(Rw, adr, 4, true)) {
1868 Assembler::strw(Rw, adr);
1869 }
1870 }
1871
1872 // MacroAssembler routines found actually to be needed
1873
push(Register src)1874 void MacroAssembler::push(Register src)
1875 {
1876 str(src, Address(pre(esp, -1 * wordSize)));
1877 }
1878
pop(Register dst)1879 void MacroAssembler::pop(Register dst)
1880 {
1881 ldr(dst, Address(post(esp, 1 * wordSize)));
1882 }
1883
1884 // Note: load_unsigned_short used to be called load_unsigned_word.
load_unsigned_short(Register dst,Address src)1885 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1886 int off = offset();
1887 ldrh(dst, src);
1888 return off;
1889 }
1890
load_unsigned_byte(Register dst,Address src)1891 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1892 int off = offset();
1893 ldrb(dst, src);
1894 return off;
1895 }
1896
load_signed_short(Register dst,Address src)1897 int MacroAssembler::load_signed_short(Register dst, Address src) {
1898 int off = offset();
1899 ldrsh(dst, src);
1900 return off;
1901 }
1902
load_signed_byte(Register dst,Address src)1903 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1904 int off = offset();
1905 ldrsb(dst, src);
1906 return off;
1907 }
1908
load_signed_short32(Register dst,Address src)1909 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1910 int off = offset();
1911 ldrshw(dst, src);
1912 return off;
1913 }
1914
load_signed_byte32(Register dst,Address src)1915 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1916 int off = offset();
1917 ldrsbw(dst, src);
1918 return off;
1919 }
1920
load_sized_value(Register dst,Address src,size_t size_in_bytes,bool is_signed,Register dst2)1921 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1922 switch (size_in_bytes) {
1923 case 8: ldr(dst, src); break;
1924 case 4: ldrw(dst, src); break;
1925 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1926 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1927 default: ShouldNotReachHere();
1928 }
1929 }
1930
store_sized_value(Address dst,Register src,size_t size_in_bytes,Register src2)1931 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1932 switch (size_in_bytes) {
1933 case 8: str(src, dst); break;
1934 case 4: strw(src, dst); break;
1935 case 2: strh(src, dst); break;
1936 case 1: strb(src, dst); break;
1937 default: ShouldNotReachHere();
1938 }
1939 }
1940
decrementw(Register reg,int value)1941 void MacroAssembler::decrementw(Register reg, int value)
1942 {
1943 if (value < 0) { incrementw(reg, -value); return; }
1944 if (value == 0) { return; }
1945 if (value < (1 << 12)) { subw(reg, reg, value); return; }
1946 /* else */ {
1947 guarantee(reg != rscratch2, "invalid dst for register decrement");
1948 movw(rscratch2, (unsigned)value);
1949 subw(reg, reg, rscratch2);
1950 }
1951 }
1952
decrement(Register reg,int value)1953 void MacroAssembler::decrement(Register reg, int value)
1954 {
1955 if (value < 0) { increment(reg, -value); return; }
1956 if (value == 0) { return; }
1957 if (value < (1 << 12)) { sub(reg, reg, value); return; }
1958 /* else */ {
1959 assert(reg != rscratch2, "invalid dst for register decrement");
1960 mov(rscratch2, (uint64_t)value);
1961 sub(reg, reg, rscratch2);
1962 }
1963 }
1964
// Decrement the 32-bit word at dst in place.  Clobbers rscratch1 (the
// loaded value) and, for literal addresses, rscratch2 (the rebased
// address).
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  if (dst.getMode() == Address::literal) {
    // The register decrement below must not clobber rscratch2 while it
    // still holds the address, which it only avoids when the value
    // fits the immediate field.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}
1977
// Decrement the 64-bit word at dst in place.  Clobbers rscratch1 (the
// loaded value) and, for literal addresses, rscratch2 (the rebased
// address).
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  if (dst.getMode() == Address::literal) {
    // The register decrement below must not clobber rscratch2 while it
    // still holds the address, which it only avoids when the value
    // fits the immediate field.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}
1990
incrementw(Register reg,int value)1991 void MacroAssembler::incrementw(Register reg, int value)
1992 {
1993 if (value < 0) { decrementw(reg, -value); return; }
1994 if (value == 0) { return; }
1995 if (value < (1 << 12)) { addw(reg, reg, value); return; }
1996 /* else */ {
1997 assert(reg != rscratch2, "invalid dst for register increment");
1998 movw(rscratch2, (unsigned)value);
1999 addw(reg, reg, rscratch2);
2000 }
2001 }
2002
increment(Register reg,int value)2003 void MacroAssembler::increment(Register reg, int value)
2004 {
2005 if (value < 0) { decrement(reg, -value); return; }
2006 if (value == 0) { return; }
2007 if (value < (1 << 12)) { add(reg, reg, value); return; }
2008 /* else */ {
2009 assert(reg != rscratch2, "invalid dst for register increment");
2010 movw(rscratch2, (unsigned)value);
2011 add(reg, reg, rscratch2);
2012 }
2013 }
2014
// Increment the 32-bit word at dst in place.  Clobbers rscratch1 (the
// loaded value) and, for literal addresses, rscratch2 (the rebased
// address).
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // The register increment below must not clobber rscratch2 while it
    // still holds the address, which it only avoids when the value
    // fits the immediate field.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}
2027
// Increment the 64-bit word at dst in place.  Clobbers rscratch1 (the
// loaded value) and, for literal addresses, rscratch2 (the rebased
// address).
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  if (dst.getMode() == Address::literal) {
    // The register increment below must not clobber rscratch2 while it
    // still holds the address, which it only avoids when the value
    // fits the immediate field.
    assert(abs(value) < (1 << 12), "invalid value and address mode combination");
    lea(rscratch2, dst);
    dst = Address(rscratch2);
  }
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}
2040
2041
// Push general-purpose registers r0..r30 (bitmask 0x7fffffff selects
// registers 0-30; sp itself is never pushed).
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}
2045
// Pop general-purpose registers r0..r30, mirroring pusha().
void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}
2049
// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // Pad with zr so an odd register count still stores full pairs.
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    // First pair pre-decrements the stack by the entire frame size.
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  // Remaining pairs store at fixed offsets within the new frame.
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2081
// Pop the registers in the bit set supplied, mirroring
// push(bitset, stack).  Returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // Pad with zr so an odd register count still loads full pairs.
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  // Inner pairs first; the final ldp post-increments the stack to
  // release the whole frame.
  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
2111 #ifdef ASSERT
verify_heapbase(const char * msg)2112 void MacroAssembler::verify_heapbase(const char* msg) {
2113 #if 0
2114 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2115 assert (Universe::heap() != NULL, "java heap should be initialized");
2116 if (CheckCompressedOops) {
2117 Label ok;
2118 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2119 cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2120 br(Assembler::EQ, ok);
2121 stop(msg);
2122 bind(ok);
2123 pop(1 << rscratch1->encoding(), sp);
2124 }
2125 #endif
2126 }
2127 #endif
2128
resolve_jobject(Register value,Register thread,Register tmp)2129 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2130 Label done, not_weak;
2131 cbz(value, done); // Use NULL as-is.
2132
2133 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2134 tbz(r0, 0, not_weak); // Test for jweak tag.
2135
2136 // Resolve jweak.
2137 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2138 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2139 verify_oop(value);
2140 b(done);
2141
2142 bind(not_weak);
2143 // Resolve (untagged) jobject.
2144 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2145 verify_oop(value);
2146 bind(done);
2147 }
2148
stop(const char * msg)2149 void MacroAssembler::stop(const char* msg) {
2150 address ip = pc();
2151 pusha();
2152 movptr(c_rarg0, (uintptr_t)(address)msg);
2153 movptr(c_rarg1, (uintptr_t)(address)ip);
2154 mov(c_rarg2, sp);
2155 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2156 blr(c_rarg3);
2157 hlt(0);
2158 }
2159
warn(const char * msg)2160 void MacroAssembler::warn(const char* msg) {
2161 pusha();
2162 mov(c_rarg0, (address)msg);
2163 mov(lr, CAST_FROM_FN_PTR(address, warning));
2164 blr(lr);
2165 popa();
2166 }
2167
unimplemented(const char * what)2168 void MacroAssembler::unimplemented(const char* what) {
2169 const char* buf = NULL;
2170 {
2171 ResourceMark rm;
2172 stringStream ss;
2173 ss.print("unimplemented: %s", what);
2174 buf = code_string(ss.as_string());
2175 }
2176 stop(buf);
2177 }
2178
// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      // Split into two 12-bit chunks: high 12 bits first (into Rd),
      // then the low 12 bits on top of Rd.
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      // Materialize the constant in Rd and use the register form.
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
}
2198
// Separate version which sets the flags.  Optimisations are more
// restricted because we must set the flags correctly: the two-chunk
// split used above would leave the flags reflecting only the second
// addition, so any out-of-range immediate goes through a register.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
  } else {
    assert_different_registers(Rd, Rn);
    assert(Rd != zr, "overflow in immediate operand");
    mov(Rd, (uint64_t)imm);
    (this->*insn2)(Rd, Rn, Rd, LSL, 0);
  }
}
2213
2214
add(Register Rd,Register Rn,RegisterOrConstant increment)2215 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2216 if (increment.is_register()) {
2217 add(Rd, Rn, increment.as_register());
2218 } else {
2219 add(Rd, Rn, increment.as_constant());
2220 }
2221 }
2222
addw(Register Rd,Register Rn,RegisterOrConstant increment)2223 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2224 if (increment.is_register()) {
2225 addw(Rd, Rn, increment.as_register());
2226 } else {
2227 addw(Rd, Rn, increment.as_constant());
2228 }
2229 }
2230
sub(Register Rd,Register Rn,RegisterOrConstant decrement)2231 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2232 if (decrement.is_register()) {
2233 sub(Rd, Rn, decrement.as_register());
2234 } else {
2235 sub(Rd, Rn, decrement.as_constant());
2236 }
2237 }
2238
subw(Register Rd,Register Rn,RegisterOrConstant decrement)2239 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2240 if (decrement.is_register()) {
2241 subw(Rd, Rn, decrement.as_register());
2242 } else {
2243 subw(Rd, Rn, decrement.as_constant());
2244 }
2245 }
2246
// Reload rheapbase with the compressed-oop base.  Once the heap is
// fully initialized the base is a known constant; before that it must
// be re-read from the base address cell at runtime.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}
2258
2259 // this simulates the behaviour of the x86 cmpxchg instruction using a
2260 // load linked/store conditional pair. we use the acquire/release
2261 // versions of these instructions so that we flush pending writes as
2262 // per Java semantics.
2263
// N.B. the x86 version assumes the old value to be compared against
// is in rax and updates rax with the value located in memory if the
// cmpxchg fails.  We supply a register for the old value explicitly.
2267
2268 // the aarch64 load linked/store conditional instructions do not
2269 // accept an offset. so, unlike x86, we must provide a plain register
2270 // to identify the memory word to be compared/exchanged rather than a
2271 // register+offset Address.
2272
// 64-bit compare-and-exchange of *addr: if it equals oldv, store newv
// and branch to succeed; otherwise leave the observed value in oldv
// and fall through (or branch to *fail if given).  Clobbers tmp.
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  if (UseLSE) {
    // casal writes the observed memory value into oldv; compare it
    // against the saved expected value in tmp.
    mov(tmp, oldv);
    casal(Assembler::xword, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxr(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2308
// Compare-and-exchange the mark word of obj (mark is at offset 0, so
// obj itself addresses it).  Same contract as cmpxchgptr.
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}
2314
// 32-bit compare-and-exchange of *addr: if it equals oldv, store newv
// and branch to succeed; otherwise leave the observed value in oldv
// and fall through (or branch to *fail if given).  Clobbers tmp.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  if (UseLSE) {
    // casal writes the observed memory value into oldv; compare it
    // against the saved expected value in tmp.
    mov(tmp, oldv);
    casal(Assembler::word, oldv, newv, addr);
    cmp(tmp, oldv);
    br(Assembler::EQ, succeed);
    membar(AnyAny);
  } else {
    Label retry_load, nope;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    // flush and load exclusive from the memory location
    // and fail if it is not what we expect
    ldaxrw(tmp, addr);
    cmp(tmp, oldv);
    br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare
    // ensures we don't return a stale value after a failed write.
    b(retry_load);
    // if the memory word differs we return it in oldv and signal a fail
    bind(nope);
    membar(AnyAny);
    mov(oldv, tmp);
  }
  if (fail)
    b(*fail);
}
2351
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// Pass a register for the result, otherwise pass noreg.

// Clobbers rscratch1
void MacroAssembler::cmpxchg(Register addr, Register expected,
                             Register new_val,
                             enum operand_size size,
                             bool acquire, bool release,
                             bool weak,
                             Register result) {
  if (result == noreg)  result = rscratch1;
  BLOCK_COMMENT("cmpxchg {");
  if (UseLSE) {
    // lse_cas leaves the observed memory value in result; EQ iff it
    // matched the expected value.
    mov(result, expected);
    lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
    compare_eq(result, expected, size);
  } else {
    Label retry_load, done;
    if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
      prfm(Address(addr), PSTL1STRM);
    bind(retry_load);
    load_exclusive(result, addr, size, acquire);
    compare_eq(result, expected, size);
    br(Assembler::NE, done);
    store_exclusive(rscratch1, new_val, addr, size, release);
    if (weak) {
      cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
    } else {
      cbnzw(rscratch1, retry_load);
    }
    bind(done);
  }
  BLOCK_COMMENT("} cmpxchg");
}
2387
2388 // A generic comparison. Only compares for equality, clobbers rscratch1.
compare_eq(Register rm,Register rn,enum operand_size size)2389 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2390 if (size == xword) {
2391 cmp(rm, rn);
2392 } else if (size == word) {
2393 cmpw(rm, rn);
2394 } else if (size == halfword) {
2395 eorw(rscratch1, rm, rn);
2396 ands(zr, rscratch1, 0xffff);
2397 } else if (size == byte) {
2398 eorw(rscratch1, rm, rn);
2399 ands(zr, rscratch1, 0xff);
2400 } else {
2401 ShouldNotReachHere();
2402 }
2403 }
2404
2405
different(Register a,RegisterOrConstant b,Register c)2406 static bool different(Register a, RegisterOrConstant b, Register c) {
2407 if (b.is_constant())
2408 return a != c;
2409 else
2410 return a != b.as_register() && a != c && b.as_register() != c;
2411 }
2412
// Generates MacroAssembler::atomic_<NAME>(prev, incr, addr): atomically
// add 'incr' to *addr, optionally returning the previous value in
// 'prev' (pass an invalid register to discard it).  With LSE this is a
// single LDADD-family instruction (AOP); otherwise an LDXR/STXR retry
// loop.  In the loop form, when 'prev' would alias incr/addr the old
// value is recovered afterwards by applying the inverse op (IOP) to
// the computed new value.
// (All commentary lives outside the macro: a // comment inside the
// body would swallow the line-continuation backslash.)
#define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    if (incr.is_register()) {                                           \
      AOP(sz, incr.as_register(), prev, addr);                          \
    } else {                                                            \
      mov(rscratch2, incr.as_constant());                               \
      AOP(sz, rscratch2, prev, addr);                                   \
    }                                                                   \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch2, rscratch1, addr);                                     \
  cbnzw(rscratch2, retry_load);                                         \
  if (prev->is_valid() && prev != result) {                             \
    IOP(prev, rscratch1, incr);                                         \
  }                                                                     \
}

ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)

#undef ATOMIC_OP
2448
// Generates MacroAssembler::atomic_<OP>(prev, newv, addr): atomically
// swap *addr with newv, optionally returning the previous value in
// 'prev' (pass an invalid register to discard it).  With LSE this is a
// single SWP-family instruction (AOP); otherwise an LDXR/STXR retry
// loop, loading directly into 'prev' when it does not alias the
// operands.
// (All commentary lives outside the macro: a // comment inside the
// body would swallow the line-continuation backslash.)
#define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  if (UseLSE) {                                                         \
    prev = prev->is_valid() ? prev : zr;                                \
    AOP(sz, newv, prev, addr);                                          \
    return;                                                             \
  }                                                                     \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                 \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
    prfm(Address(addr), PSTL1STRM);                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                               \
    mov(prev, result);                                                  \
}

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

#undef ATOMIC_XCHG
2479
2480 #ifndef PRODUCT
2481 extern "C" void findpc(intptr_t x);
2482 #endif
2483
// Runtime (not code-generating) debugging entry: invoked with the stop
// message, the pc of the stopped code and a block of saved integer
// registers.  With +ShowMessageBoxOnError it offers to dump the saved
// register state and then hits a breakpoint; otherwise it prints the
// message and asserts.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x" UINT64_FORMAT_X, pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);   // debug-only helper: resolve pc to a code location
      tty->cr();
#endif
      tty->print_cr(" r0 = 0x" UINT64_FORMAT_X, regs[0]);
      tty->print_cr(" r1 = 0x" UINT64_FORMAT_X, regs[1]);
      tty->print_cr(" r2 = 0x" UINT64_FORMAT_X, regs[2]);
      tty->print_cr(" r3 = 0x" UINT64_FORMAT_X, regs[3]);
      tty->print_cr(" r4 = 0x" UINT64_FORMAT_X, regs[4]);
      tty->print_cr(" r5 = 0x" UINT64_FORMAT_X, regs[5]);
      tty->print_cr(" r6 = 0x" UINT64_FORMAT_X, regs[6]);
      tty->print_cr(" r7 = 0x" UINT64_FORMAT_X, regs[7]);
      tty->print_cr(" r8 = 0x" UINT64_FORMAT_X, regs[8]);
      tty->print_cr(" r9 = 0x" UINT64_FORMAT_X, regs[9]);
      tty->print_cr("r10 = 0x" UINT64_FORMAT_X, regs[10]);
      tty->print_cr("r11 = 0x" UINT64_FORMAT_X, regs[11]);
      tty->print_cr("r12 = 0x" UINT64_FORMAT_X, regs[12]);
      tty->print_cr("r13 = 0x" UINT64_FORMAT_X, regs[13]);
      tty->print_cr("r14 = 0x" UINT64_FORMAT_X, regs[14]);
      tty->print_cr("r15 = 0x" UINT64_FORMAT_X, regs[15]);
      tty->print_cr("r16 = 0x" UINT64_FORMAT_X, regs[16]);
      tty->print_cr("r17 = 0x" UINT64_FORMAT_X, regs[17]);
      tty->print_cr("r18 = 0x" UINT64_FORMAT_X, regs[18]);
      tty->print_cr("r19 = 0x" UINT64_FORMAT_X, regs[19]);
      tty->print_cr("r20 = 0x" UINT64_FORMAT_X, regs[20]);
      tty->print_cr("r21 = 0x" UINT64_FORMAT_X, regs[21]);
      tty->print_cr("r22 = 0x" UINT64_FORMAT_X, regs[22]);
      tty->print_cr("r23 = 0x" UINT64_FORMAT_X, regs[23]);
      tty->print_cr("r24 = 0x" UINT64_FORMAT_X, regs[24]);
      tty->print_cr("r25 = 0x" UINT64_FORMAT_X, regs[25]);
      tty->print_cr("r26 = 0x" UINT64_FORMAT_X, regs[26]);
      tty->print_cr("r27 = 0x" UINT64_FORMAT_X, regs[27]);
      tty->print_cr("r28 = 0x" UINT64_FORMAT_X, regs[28]);
      // regs[29] is not printed here.
      tty->print_cr("r30 = 0x" UINT64_FORMAT_X, regs[30]);
      tty->print_cr("r31 = 0x" UINT64_FORMAT_X, regs[31]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}
2546
// Emit code to save the call-clobbered registers: integer registers
// r0-r18 (excluding the scratch registers rscratch1/rscratch2) and the
// low 64 bits (T1D) of FP/SIMD registers v0-v7 and v16-v31.
// Restored by pop_call_clobbered_registers().
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;   // four 8-byte slots per st1 group
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);     // post-index decrement used by the st1 stores
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  // Final group (v0-v3) stored without adjusting sp again.
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}
2561
// Emit code to restore the registers saved by
// push_call_clobbered_registers(), in the reverse order: first the
// FP/SIMD groups (v0-v7, v16-v31, low 64 bits each), then the integer
// registers r0-r18 minus the scratch registers.
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}
2571
// Emit code to save the full CPU state: integer registers r0-r29
// (bitmask 0x3fffffff; lr and sp excluded) plus all 32 FP/SIMD
// registers.  With save_vectors the full 128 bits of each vector are
// saved (T2D), otherwise only the low 64 bits (T1D).
void MacroAssembler::push_CPU_state(bool save_vectors) {
  int step = (save_vectors ? 8 : 4) * wordSize;   // bytes per st1 group of 4
  push(0x3fffffff, sp); // integer registers except lr & sp
  mov(rscratch1, -step);   // post-index decrement for the st1 stores
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  // Final group (v0-v3) stored at the current sp.
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}
2583
// Emit code to restore the state saved by push_CPU_state(): first the
// 32 FP/SIMD registers in ascending groups of four, then the integer
// registers r0-r29.  restore_vectors must match the save_vectors value
// used when saving, so the step sizes agree.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp); // integer registers except lr & sp
}
2591
2592 /**
2593 * Helpers for multiply_to_len().
2594 */
add2_with_carry(Register final_dest_hi,Register dest_hi,Register dest_lo,Register src1,Register src2)2595 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2596 Register src1, Register src2) {
2597 adds(dest_lo, dest_lo, src1);
2598 adc(dest_hi, dest_hi, zr);
2599 adds(dest_lo, dest_lo, src2);
2600 adc(final_dest_hi, dest_hi, zr);
2601 }
2602
// Generate an address from (r + r1 extend offset). "size" is the
// size of the operand. The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  if (offset || (ext.shift() % size != 0)) {
    // Not encodable as a single reg+extended-reg operand: materialize
    // r + (r1 extended) into rscratch2 and return base+offset instead.
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}
2614
// Build an Address for a spill slot at sp + offset.  When the offset
// cannot be encoded in the load/store immediate field, part of it is
// added into tmp and tmp becomes the base register.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  // Not aligned - 9 bits signed offset
  // Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Unaligned offset beyond the 9-bit unscaled range: fold the low
    // 12 bits into tmp, leaving only bits >= 12 in offset.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    // Beyond the 12-bit scaled-immediate range: fold the next 12 bits
    // (bits 12..23) into the base as well.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}
2636
2637 // Checks whether offset is aligned.
2638 // Returns true if it is, else false.
merge_alignment_check(Register base,size_t size,int64_t cur_offset,int64_t prev_offset) const2639 bool MacroAssembler::merge_alignment_check(Register base,
2640 size_t size,
2641 int64_t cur_offset,
2642 int64_t prev_offset) const {
2643 if (AvoidUnalignedAccesses) {
2644 if (base == sp) {
2645 // Checks whether low offset if aligned to pair of registers.
2646 int64_t pair_mask = size * 2 - 1;
2647 int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2648 return (offset & pair_mask) == 0;
2649 } else { // If base is not sp, we can't guarantee the access is aligned.
2650 return false;
2651 }
2652 } else {
2653 int64_t mask = size - 1;
2654 // Load/store pair instruction only supports element size aligned offset.
2655 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2656 }
2657 }
2658
// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  // Candidate: the instruction emitted immediately before the current pc.
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The tracked last instruction must be a plain immediate-offset
  // load/store.
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  // Only base+offset addressing can merge, and the tracked instruction
  // must really be the one just emitted (prev == last).
  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Both accesses must have the same size and direction (load vs store).
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp encode a scaled 7-bit signed immediate: [-64, 63] elements.
  int64_t max_offset = 63 * prev_size_in_bytes;
  int64_t min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two offsets must be exactly adjacent (one element apart).
  int64_t cur_offset = adr.offset();
  int64_t prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  // Finally the merged pair must satisfy the alignment policy.
  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}
2726
// Merge current load/store with previous load/store into ldp/stp.
// The previously emitted instruction is decoded, removed from the code
// buffer, and replaced (together with the current access) by a single
// ldp/stp covering both slots.  Caller must have checked
// ldst_can_merge() first.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  int64_t offset;

  // Order the two registers by their offsets: the pair instruction is
  // anchored at the lower offset.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}
2773
2774 /**
2775 * Multiply 64 bit by 64 bit first loop.
2776 */
multiply_64_x_64_loop(Register x,Register xstart,Register x_xstart,Register y,Register y_idx,Register z,Register carry,Register product,Register idx,Register kdx)2777 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2778 Register y, Register y_idx, Register z,
2779 Register carry, Register product,
2780 Register idx, Register kdx) {
2781 //
2782 // jlong carry, x[], y[], z[];
2783 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2784 // huge_128 product = y[idx] * x[xstart] + carry;
2785 // z[kdx] = (jlong)product;
2786 // carry = (jlong)(product >>> 64);
2787 // }
2788 // z[xstart] = carry;
2789 //
2790
2791 Label L_first_loop, L_first_loop_exit;
2792 Label L_one_x, L_one_y, L_multiply;
2793
2794 subsw(xstart, xstart, 1);
2795 br(Assembler::MI, L_one_x);
2796
2797 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2798 ldr(x_xstart, Address(rscratch1));
2799 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2800
2801 bind(L_first_loop);
2802 subsw(idx, idx, 1);
2803 br(Assembler::MI, L_first_loop_exit);
2804 subsw(idx, idx, 1);
2805 br(Assembler::MI, L_one_y);
2806 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2807 ldr(y_idx, Address(rscratch1));
2808 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2809 bind(L_multiply);
2810
2811 // AArch64 has a multiply-accumulate instruction that we can't use
2812 // here because it has no way to process carries, so we have to use
2813 // separate add and adc instructions. Bah.
2814 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2815 mul(product, x_xstart, y_idx);
2816 adds(product, product, carry);
2817 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product
2818
2819 subw(kdx, kdx, 2);
2820 ror(product, product, 32); // back to big-endian
2821 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2822
2823 b(L_first_loop);
2824
2825 bind(L_one_y);
2826 ldrw(y_idx, Address(y, 0));
2827 b(L_multiply);
2828
2829 bind(L_one_x);
2830 ldrw(x_xstart, Address(x, 0));
2831 b(L_first_loop);
2832
2833 bind(L_first_loop_exit);
2834 }
2835
2836 /**
2837 * Multiply 128 bit by 128. Unrolled inner loop.
2838 *
2839 */
multiply_128_x_128_loop(Register y,Register z,Register carry,Register carry2,Register idx,Register jdx,Register yz_idx1,Register yz_idx2,Register tmp,Register tmp3,Register tmp4,Register tmp6,Register product_hi)2840 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2841 Register carry, Register carry2,
2842 Register idx, Register jdx,
2843 Register yz_idx1, Register yz_idx2,
2844 Register tmp, Register tmp3, Register tmp4,
2845 Register tmp6, Register product_hi) {
2846
2847 // jlong carry, x[], y[], z[];
2848 // int kdx = ystart+1;
2849 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2850 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2851 // jlong carry2 = (jlong)(tmp3 >>> 64);
2852 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
2853 // carry = (jlong)(tmp4 >>> 64);
2854 // z[kdx+idx+1] = (jlong)tmp3;
2855 // z[kdx+idx] = (jlong)tmp4;
2856 // }
2857 // idx += 2;
2858 // if (idx > 0) {
2859 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2860 // z[kdx+idx] = (jlong)yz_idx1;
2861 // carry = (jlong)(yz_idx1 >>> 64);
2862 // }
2863 //
2864
2865 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2866
2867 lsrw(jdx, idx, 2);
2868
2869 bind(L_third_loop);
2870
2871 subsw(jdx, jdx, 1);
2872 br(Assembler::MI, L_third_loop_exit);
2873 subw(idx, idx, 4);
2874
2875 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2876
2877 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2878
2879 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2880
2881 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2882 ror(yz_idx2, yz_idx2, 32);
2883
2884 ldp(rscratch2, rscratch1, Address(tmp6, 0));
2885
2886 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
2887 umulh(tmp4, product_hi, yz_idx1);
2888
2889 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2890 ror(rscratch2, rscratch2, 32);
2891
2892 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
2893 umulh(carry2, product_hi, yz_idx2);
2894
2895 // propagate sum of both multiplications into carry:tmp4:tmp3
2896 adds(tmp3, tmp3, carry);
2897 adc(tmp4, tmp4, zr);
2898 adds(tmp3, tmp3, rscratch1);
2899 adcs(tmp4, tmp4, tmp);
2900 adc(carry, carry2, zr);
2901 adds(tmp4, tmp4, rscratch2);
2902 adc(carry, carry, zr);
2903
2904 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2905 ror(tmp4, tmp4, 32);
2906 stp(tmp4, tmp3, Address(tmp6, 0));
2907
2908 b(L_third_loop);
2909 bind (L_third_loop_exit);
2910
2911 andw (idx, idx, 0x3);
2912 cbz(idx, L_post_third_loop_done);
2913
2914 Label L_check_1;
2915 subsw(idx, idx, 2);
2916 br(Assembler::MI, L_check_1);
2917
2918 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2919 ldr(yz_idx1, Address(rscratch1, 0));
2920 ror(yz_idx1, yz_idx1, 32);
2921 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
2922 umulh(tmp4, product_hi, yz_idx1);
2923 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2924 ldr(yz_idx2, Address(rscratch1, 0));
2925 ror(yz_idx2, yz_idx2, 32);
2926
2927 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2928
2929 ror(tmp3, tmp3, 32);
2930 str(tmp3, Address(rscratch1, 0));
2931
2932 bind (L_check_1);
2933
2934 andw (idx, idx, 0x1);
2935 subsw(idx, idx, 1);
2936 br(Assembler::MI, L_post_third_loop_done);
2937 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2938 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
2939 umulh(carry2, tmp4, product_hi);
2940 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2941
2942 add2_with_carry(carry2, tmp3, tmp4, carry);
2943
2944 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2945 extr(carry, carry2, tmp3, 32);
2946
2947 bind(L_post_third_loop_done);
2948 }
2949
2950 /**
2951 * Code for BigInteger::multiplyToLen() instrinsic.
2952 *
2953 * r0: x
2954 * r1: xlen
2955 * r2: y
2956 * r3: ylen
2957 * r4: z
2958 * r5: zlen
2959 * r10: tmp1
2960 * r11: tmp2
2961 * r12: tmp3
2962 * r13: tmp4
2963 * r14: tmp5
2964 * r15: tmp6
2965 * r16: tmp7
2966 *
2967 */
multiply_to_len(Register x,Register xlen,Register y,Register ylen,Register z,Register zlen,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register tmp6,Register product_hi)2968 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2969 Register z, Register zlen,
2970 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2971 Register tmp5, Register tmp6, Register product_hi) {
2972
2973 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2974
2975 const Register idx = tmp1;
2976 const Register kdx = tmp2;
2977 const Register xstart = tmp3;
2978
2979 const Register y_idx = tmp4;
2980 const Register carry = tmp5;
2981 const Register product = xlen;
2982 const Register x_xstart = zlen; // reuse register
2983
2984 // First Loop.
2985 //
2986 // final static long LONG_MASK = 0xffffffffL;
2987 // int xstart = xlen - 1;
2988 // int ystart = ylen - 1;
2989 // long carry = 0;
2990 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2991 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2992 // z[kdx] = (int)product;
2993 // carry = product >>> 32;
2994 // }
2995 // z[xstart] = (int)carry;
2996 //
2997
2998 movw(idx, ylen); // idx = ylen;
2999 movw(kdx, zlen); // kdx = xlen+ylen;
3000 mov(carry, zr); // carry = 0;
3001
3002 Label L_done;
3003
3004 movw(xstart, xlen);
3005 subsw(xstart, xstart, 1);
3006 br(Assembler::MI, L_done);
3007
3008 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3009
3010 Label L_second_loop;
3011 cbzw(kdx, L_second_loop);
3012
3013 Label L_carry;
3014 subw(kdx, kdx, 1);
3015 cbzw(kdx, L_carry);
3016
3017 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3018 lsr(carry, carry, 32);
3019 subw(kdx, kdx, 1);
3020
3021 bind(L_carry);
3022 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3023
3024 // Second and third (nested) loops.
3025 //
3026 // for (int i = xstart-1; i >= 0; i--) { // Second loop
3027 // carry = 0;
3028 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3029 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3030 // (z[k] & LONG_MASK) + carry;
3031 // z[k] = (int)product;
3032 // carry = product >>> 32;
3033 // }
3034 // z[i] = (int)carry;
3035 // }
3036 //
3037 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3038
3039 const Register jdx = tmp1;
3040
3041 bind(L_second_loop);
3042 mov(carry, zr); // carry = 0;
3043 movw(jdx, ylen); // j = ystart+1
3044
3045 subsw(xstart, xstart, 1); // i = xstart-1;
3046 br(Assembler::MI, L_done);
3047
3048 str(z, Address(pre(sp, -4 * wordSize)));
3049
3050 Label L_last_x;
3051 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3052 subsw(xstart, xstart, 1); // i = xstart-1;
3053 br(Assembler::MI, L_last_x);
3054
3055 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3056 ldr(product_hi, Address(rscratch1));
3057 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
3058
3059 Label L_third_loop_prologue;
3060 bind(L_third_loop_prologue);
3061
3062 str(ylen, Address(sp, wordSize));
3063 stp(x, xstart, Address(sp, 2 * wordSize));
3064 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3065 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3066 ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3067 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen
3068
3069 addw(tmp3, xlen, 1);
3070 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3071 subsw(tmp3, tmp3, 1);
3072 br(Assembler::MI, L_done);
3073
3074 lsr(carry, carry, 32);
3075 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3076 b(L_second_loop);
3077
3078 // Next infrequent code is moved outside loops.
3079 bind(L_last_x);
3080 ldrw(product_hi, Address(x, 0));
3081 b(L_third_loop_prologue);
3082
3083 bind(L_done);
3084 }
3085
// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k) {
  Label LOOP, END;
  // pre-loop
  cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
  csel(out, zr, out, Assembler::EQ);   // len == 0: return 0
  br(Assembler::EQ, END);
  add(in, in, len, LSL, 2); // in[j+1] address
  add(offset, out, offset, LSL, 2); // out[offset + 1] address
  mov(out, zr); // used to keep carry now
  BIND(LOOP);
  ldrw(rscratch1, Address(pre(in, -4)));      // in[j]
  madd(rscratch1, rscratch1, k, out);         // in[j] * k + carry
  ldrw(rscratch2, Address(pre(offset, -4)));  // out[offset]
  add(rscratch1, rscratch1, rscratch2);
  strw(rscratch1, Address(offset));
  lsr(out, rscratch1, 32);                    // carry = product >>> 32
  subs(len, len, 1);
  br(Assembler::NE, LOOP);
  BIND(END);
}
3123
3124 /**
3125 * Emits code to update CRC-32 with a byte value according to constants in table
3126 *
3127 * @param [in,out]crc Register containing the crc.
3128 * @param [in]val Register containing the byte to fold into the CRC.
3129 * @param [in]table Register containing the table of crc constants.
3130 *
3131 * uint32_t crc;
3132 * val = crc_table[(val ^ crc) & 0xFF];
3133 * crc = val ^ (crc >> 8);
3134 *
3135 */
update_byte_crc32(Register crc,Register val,Register table)3136 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3137 eor(val, val, crc);
3138 andr(val, val, 0xff);
3139 ldrw(val, Address(table, val, Address::lsl(2)));
3140 eor(crc, val, crc, Assembler::LSR, 8);
3141 }
3142
3143 /**
3144 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3145 *
3146 * @param [in,out]crc Register containing the crc.
3147 * @param [in]v Register containing the 32-bit to fold into the CRC.
3148 * @param [in]table0 Register containing table 0 of crc constants.
3149 * @param [in]table1 Register containing table 1 of crc constants.
3150 * @param [in]table2 Register containing table 2 of crc constants.
3151 * @param [in]table3 Register containing table 3 of crc constants.
3152 *
3153 * uint32_t crc;
3154 * v = crc ^ v
3155 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3156 *
3157 */
update_word_crc32(Register crc,Register v,Register tmp,Register table0,Register table1,Register table2,Register table3,bool upper)3158 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3159 Register table0, Register table1, Register table2, Register table3,
3160 bool upper) {
3161 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3162 uxtb(tmp, v);
3163 ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3164 ubfx(tmp, v, 8, 8);
3165 ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3166 eor(crc, crc, tmp);
3167 ubfx(tmp, v, 16, 8);
3168 ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3169 eor(crc, crc, tmp);
3170 ubfx(tmp, v, 24, 8);
3171 ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3172 eor(crc, crc, tmp);
3173 }
3174
// Emits a CRC-32 kernel built on the ARMv8 crc32x/crc32w/crc32b
// instructions.  Processes 64 bytes per main-loop iteration with the
// loads software-pipelined ahead of the crc32x uses, then falls back to
// 32-byte, 4-byte and 1-byte loops for the remainder.  crc is updated
// in place (bit-inverted on entry and exit, per the CRC-32 convention
// visible in the mvnw pair); buf, len and tmp0..tmp3 are clobbered.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    // Dispatch on length: >= 128 goes to the pipelined 64-byte loop.
    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

    // Prologue of the 64-byte loop: consume the first 64 bytes while
    // priming tmp2/tmp3 so the loop's loads stay ahead of their uses.
  BIND(CRC_by64_pre);
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop: fold in the two words still in flight.
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}
3273
3274 /**
3275 * @param crc register containing existing CRC (32-bit)
3276 * @param buf register pointing to input byte buffer (byte*)
3277 * @param len register containing number of bytes
3278 * @param table register that will contain address of CRC table
3279 * @param tmp scratch register
3280 */
kernel_crc32(Register crc,Register buf,Register len,Register table0,Register table1,Register table2,Register table3,Register tmp,Register tmp2,Register tmp3)3281 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3282 Register table0, Register table1, Register table2, Register table3,
3283 Register tmp, Register tmp2, Register tmp3) {
3284 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3285 uint64_t offset;
3286
3287 if (UseCRC32) {
3288 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3289 return;
3290 }
3291
3292 mvnw(crc, crc);
3293
3294 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3295 if (offset) add(table0, table0, offset);
3296 add(table1, table0, 1*256*sizeof(juint));
3297 add(table2, table0, 2*256*sizeof(juint));
3298 add(table3, table0, 3*256*sizeof(juint));
3299
3300 if (UseNeon) {
3301 cmp(len, 64);
3302 br(Assembler::LT, L_by16);
3303 eor(v16, T16B, v16, v16);
3304
3305 Label L_fold;
3306
3307 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3308
3309 ld1(v0, v1, T2D, post(buf, 32));
3310 ld1r(v4, T2D, post(tmp, 8));
3311 ld1r(v5, T2D, post(tmp, 8));
3312 ld1r(v6, T2D, post(tmp, 8));
3313 ld1r(v7, T2D, post(tmp, 8));
3314 mov(v16, T4S, 0, crc);
3315
3316 eor(v0, T16B, v0, v16);
3317 sub(len, len, 64);
3318
3319 BIND(L_fold);
3320 pmull(v22, T8H, v0, v5, T8B);
3321 pmull(v20, T8H, v0, v7, T8B);
3322 pmull(v23, T8H, v0, v4, T8B);
3323 pmull(v21, T8H, v0, v6, T8B);
3324
3325 pmull2(v18, T8H, v0, v5, T16B);
3326 pmull2(v16, T8H, v0, v7, T16B);
3327 pmull2(v19, T8H, v0, v4, T16B);
3328 pmull2(v17, T8H, v0, v6, T16B);
3329
3330 uzp1(v24, T8H, v20, v22);
3331 uzp2(v25, T8H, v20, v22);
3332 eor(v20, T16B, v24, v25);
3333
3334 uzp1(v26, T8H, v16, v18);
3335 uzp2(v27, T8H, v16, v18);
3336 eor(v16, T16B, v26, v27);
3337
3338 ushll2(v22, T4S, v20, T8H, 8);
3339 ushll(v20, T4S, v20, T4H, 8);
3340
3341 ushll2(v18, T4S, v16, T8H, 8);
3342 ushll(v16, T4S, v16, T4H, 8);
3343
3344 eor(v22, T16B, v23, v22);
3345 eor(v18, T16B, v19, v18);
3346 eor(v20, T16B, v21, v20);
3347 eor(v16, T16B, v17, v16);
3348
3349 uzp1(v17, T2D, v16, v20);
3350 uzp2(v21, T2D, v16, v20);
3351 eor(v17, T16B, v17, v21);
3352
3353 ushll2(v20, T2D, v17, T4S, 16);
3354 ushll(v16, T2D, v17, T2S, 16);
3355
3356 eor(v20, T16B, v20, v22);
3357 eor(v16, T16B, v16, v18);
3358
3359 uzp1(v17, T2D, v20, v16);
3360 uzp2(v21, T2D, v20, v16);
3361 eor(v28, T16B, v17, v21);
3362
3363 pmull(v22, T8H, v1, v5, T8B);
3364 pmull(v20, T8H, v1, v7, T8B);
3365 pmull(v23, T8H, v1, v4, T8B);
3366 pmull(v21, T8H, v1, v6, T8B);
3367
3368 pmull2(v18, T8H, v1, v5, T16B);
3369 pmull2(v16, T8H, v1, v7, T16B);
3370 pmull2(v19, T8H, v1, v4, T16B);
3371 pmull2(v17, T8H, v1, v6, T16B);
3372
3373 ld1(v0, v1, T2D, post(buf, 32));
3374
3375 uzp1(v24, T8H, v20, v22);
3376 uzp2(v25, T8H, v20, v22);
3377 eor(v20, T16B, v24, v25);
3378
3379 uzp1(v26, T8H, v16, v18);
3380 uzp2(v27, T8H, v16, v18);
3381 eor(v16, T16B, v26, v27);
3382
3383 ushll2(v22, T4S, v20, T8H, 8);
3384 ushll(v20, T4S, v20, T4H, 8);
3385
3386 ushll2(v18, T4S, v16, T8H, 8);
3387 ushll(v16, T4S, v16, T4H, 8);
3388
3389 eor(v22, T16B, v23, v22);
3390 eor(v18, T16B, v19, v18);
3391 eor(v20, T16B, v21, v20);
3392 eor(v16, T16B, v17, v16);
3393
3394 uzp1(v17, T2D, v16, v20);
3395 uzp2(v21, T2D, v16, v20);
3396 eor(v16, T16B, v17, v21);
3397
3398 ushll2(v20, T2D, v16, T4S, 16);
3399 ushll(v16, T2D, v16, T2S, 16);
3400
3401 eor(v20, T16B, v22, v20);
3402 eor(v16, T16B, v16, v18);
3403
3404 uzp1(v17, T2D, v20, v16);
3405 uzp2(v21, T2D, v20, v16);
3406 eor(v20, T16B, v17, v21);
3407
3408 shl(v16, T2D, v28, 1);
3409 shl(v17, T2D, v20, 1);
3410
3411 eor(v0, T16B, v0, v16);
3412 eor(v1, T16B, v1, v17);
3413
3414 subs(len, len, 32);
3415 br(Assembler::GE, L_fold);
3416
3417 mov(crc, 0);
3418 mov(tmp, v0, T1D, 0);
3419 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3420 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3421 mov(tmp, v0, T1D, 1);
3422 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3423 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3424 mov(tmp, v1, T1D, 0);
3425 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3426 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3427 mov(tmp, v1, T1D, 1);
3428 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3429 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3430
3431 add(len, len, 32);
3432 }
3433
3434 BIND(L_by16);
3435 subs(len, len, 16);
3436 br(Assembler::GE, L_by16_loop);
3437 adds(len, len, 16-4);
3438 br(Assembler::GE, L_by4_loop);
3439 adds(len, len, 4);
3440 br(Assembler::GT, L_by1_loop);
3441 b(L_exit);
3442
3443 BIND(L_by4_loop);
3444 ldrw(tmp, Address(post(buf, 4)));
3445 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3446 subs(len, len, 4);
3447 br(Assembler::GE, L_by4_loop);
3448 adds(len, len, 4);
3449 br(Assembler::LE, L_exit);
3450 BIND(L_by1_loop);
3451 subs(len, len, 1);
3452 ldrb(tmp, Address(post(buf, 1)));
3453 update_byte_crc32(crc, tmp, table0);
3454 br(Assembler::GT, L_by1_loop);
3455 b(L_exit);
3456
3457 align(CodeEntryAlignment);
3458 BIND(L_by16_loop);
3459 subs(len, len, 16);
3460 ldp(tmp, tmp3, Address(post(buf, 16)));
3461 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3462 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3463 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3464 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3465 br(Assembler::GE, L_by16_loop);
3466 adds(len, len, 16-4);
3467 br(Assembler::GE, L_by4_loop);
3468 adds(len, len, 4);
3469 br(Assembler::GT, L_by1_loop);
3470 BIND(L_exit);
3471 mvnw(crc, crc);
3472 }
3473
/**
 * CRC-32C (Castagnoli) computation using the ARMv8 crc32c{x,w,b}
 * instructions.
 *
 * @param crc  register containing existing CRC (32-bit); updated in place
 * @param buf  register pointing to input byte buffer; advanced past the
 *             consumed data
 * @param len  register containing number of bytes (clobbered)
 * @param tmp0..tmp3  scratch registers
 *
 * The buffer is consumed at successively smaller granularities:
 * 64-byte blocks, 32-byte blocks, 4-byte words, then single bytes.
 * In the 64-byte loop each load is issued ahead of the crc32cx that
 * consumes the previously loaded word, overlapping memory access with
 * CRC computation.
 */
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
  Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
  assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

  // Dispatch on how much data is available; len is kept biased below
  // zero and re-adjusted at each granularity change.
  subs(len, len, 128);
  br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
  adds(len, len, 128-32);
  br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
  adds(len, len, 32-4);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by32_loop);
  ldp(tmp0, tmp1, Address(post(buf, 16)));
  subs(len, len, 32);
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(post(buf, 8)));
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);
  br(Assembler::GE, CRC_by32_loop);
  cmn(len, 32);                    // len == -32 means nothing is left
  br(Assembler::NE, CRC_less32);
  b(L_exit);

  BIND(CRC_by4_loop);
  ldrw(tmp0, Address(post(buf, 4)));
  subs(len, len, 4);
  crc32cw(crc, crc, tmp0);
  br(Assembler::GE, CRC_by4_loop);
  adds(len, len, 4);
  br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
  ldrb(tmp0, Address(post(buf, 1)));
  subs(len, len, 1);
  crc32cb(crc, crc, tmp0);
  br(Assembler::GT, CRC_by1_loop);
  b(L_exit);

  BIND(CRC_by64_pre);
  // Prologue of the 64-byte loop: buf is biased by -8 so the loop can
  // use a pre-indexed load at +64 as its final load. The first block's
  // words are CRC'ed here, interleaved with the loads that feed the
  // first loop iteration.
  sub(buf, buf, 8);
  ldp(tmp0, tmp1, Address(buf, 8));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));

  b(CRC_by64_loop);

  align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
  subs(len, len, 64);
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 8));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 16));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 24));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(buf, 32));
  crc32cx(crc, crc, tmp2);
  ldr(tmp0, Address(buf, 40));
  crc32cx(crc, crc, tmp3);
  ldr(tmp1, Address(buf, 48));
  crc32cx(crc, crc, tmp0);
  ldr(tmp2, Address(buf, 56));
  crc32cx(crc, crc, tmp1);
  ldr(tmp3, Address(pre(buf, 64)));
  br(Assembler::GE, CRC_by64_loop);

  // post-loop: CRC the two words still in flight from the last iteration
  crc32cx(crc, crc, tmp2);
  crc32cx(crc, crc, tmp3);

  // Compensate for the pipelining bias applied in CRC_by64_pre before
  // falling back to the smaller granularities.
  sub(len, len, 64);
  add(buf, buf, 8);
  cmn(len, 128);                   // len == -128 means nothing is left
  br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}
3569
/**
 * @param crc    register containing existing CRC (32-bit)
 * @param buf    register pointing to input byte buffer (byte*)
 * @param len    register containing number of bytes
 * @param table0..table3  scratch registers (the hardware-instruction
 *                        implementation needs no CRC lookup table)
 * @param tmp    scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  // The hardware-instruction implementation needs no lookup tables, so
  // table0..table3 are handed down purely as scratch registers; tmp,
  // tmp2 and tmp3 are unused here.
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}
3582
3583
// Emits a test of the byte flag at *flag_addr: the code generated
// between construction and destruction of this object is skipped at
// run time when the flag is zero (cbzw branches to the label bound by
// the destructor).
// NOTE(review): the 'value' parameter is unused — the emitted cbzw
// always skips on flag == 0; confirm all callers rely on that polarity.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  uint64_t offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}
3592
// Binds the skip target; code emitted after this point always runs.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
3596
// Add the 32-bit constant src to the word at memory location dst:
// *dst += src. Clobbers rscratch1 and rscratch2.
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    // Fall back to materializing the effective address in rscratch2.
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}
3614
// Compare src1 with the word stored at the (pc-relative literal)
// address src2, setting the condition flags. Clobbers rscratch1.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  uint64_t offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}
3621
cmpoop(Register obj1,Register obj2)3622 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3623 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3624 bs->obj_equals(this, obj1, obj2);
3625 }
3626
// Load the klass pointer of the object referenced by src into dst,
// decompressing it when compressed class pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}
3635
3636 // ((OopHandle)result).resolve();
// Resolve the OopHandle held in 'result' in place:
// ((OopHandle)result).resolve().
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection: load the oop the handle
  // points at, through the IN_NATIVE access barrier.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}
3641
// Load the java mirror (java.lang.Class oop) of the current method's
// holder klass into dst, resolving the mirror's OopHandle.
// NOTE(review): the 'method' parameter is ignored — the chain starts
// from the fixed rmethod register; confirm all callers pass rmethod.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));                  // dst = method->constMethod
  ldr(dst, Address(dst, ConstMethod::constants_offset()));             // dst = constMethod->constants
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); // dst = pool holder klass
  ldr(dst, Address(dst, mirror_offset));                               // dst = klass->java_mirror (OopHandle)
  resolve_oop_handle(dst, tmp);
}
3650
// Compare trial_klass against the klass of object 'oop', setting the
// condition flags. Clobbers tmp. With compressed class pointers the
// comparison is done without a full decode whenever the encoding
// permits.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // Zero base: decoding is just a shift, so compare against the
      // shifted narrow value directly.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}
3669
// Load Klass::_prototype_header of src's klass into dst (the mark-word
// template installed in newly allocated instances).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}
3674
// Store the klass pointer src into the header of object dst,
// compressing it first when compressed class pointers are in use
// (note: src is clobbered by the encode in that case).
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release? concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}
3685
// Store the 32-bit value src into the klass-gap slot of object dst.
// No-op without compressed class pointers (the gap only exists then).
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}
3692
3693 // Algorithm must match CompressedOops::encode.
// Compress the oop in s into d; a NULL oop encodes to 0.
// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // d = s - base, clamped to zero when s is below the heap base
    // (which covers s == NULL), then shifted.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
       Label nonnull;
       cbnz(r, nonnull);
       sub(r, r, rheapbase);
       bind(nonnull);
       lsr(r, r, LogMinObjAlignmentInBytes);
    */
  }
}
3720
// Compress the oop in r in place. r must not be NULL (checked under
// ASSERT via CheckCompressedOops); unlike encode_heap_oop there is no
// clamp to zero, so the base subtraction is unconditional.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}
3740
// Compress the non-NULL oop in src into dst (two-register form).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks the register currently holding the partial result.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  if (data == src)
    // Neither base nor shift applied: plain register move.
    mov(dst, src);
}
3766
// Decompress the narrow oop in s into d; a zero narrow oop decodes to
// NULL (hence the cbz guard in the heap-base case).
void MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);    // NULL stays NULL: skip the base addition
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
3785
// Decompress the non-NULL narrow oop in r in place.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // Zero base: shifted add against zr keeps the instruction count
      // identical to the base case.
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
3803
// Decompress the non-NULL narrow oop in src into dst (two-register form).
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
  }
}
3824
// Compress the klass pointer in src into dst. Must mirror the logic
// of decode_klass_not_null below.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    // Zero base: encoding is at most a shift.
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // The base can be removed with an XOR instead of a subtraction —
    // see where use_XOR_for_compressed_class_base is set for the
    // precondition that makes this valid.
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Base has only high bits set and there is no shift: movw keeps
    // the low 32 bits and zero-extends.
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case: materialize the base in a scratch register. When dst
  // aliases src we borrow rheapbase and restore it afterwards.
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}
3866
// In-place variant of encode_klass_not_null.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}
3870
// Decompress the narrow klass pointer in src into dst; inverse of
// encode_klass_not_null above.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    // Zero base: decoding is at most a shift.
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // XOR re-inserts the base bits, valid under the same precondition
    // as the XOR form of the encoder.
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    // Base has only high bits set: copy the low word, then patch the
    // upper 32 bits in with movk.
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;   // borrow rheapbase as scratch
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();   // restore the clobbered rheapbase
}
3916
// In-place variant of decode_klass_not_null.
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
3920
// Load a narrow (compressed) oop constant into dst. 0xDEADBEEF is a
// placeholder: an oop relocation is recorded at the instruction mark
// so relocation processing can patch the movz/movk pair with the real
// narrow oop value.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);   // placeholder high half-word
  movk(dst, 0xBEEF);       // placeholder low half-word
}
3938
// Load the narrow (compressed) klass value for k into dst as a
// movz/movk pair, recording a metadata relocation for it.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  movz(dst, (nk >> 16), 16);   // high half-word of the narrow klass
  movk(dst, nk & 0xffff);      // low half-word
}
3952
access_load_at(BasicType type,DecoratorSet decorators,Register dst,Address src,Register tmp1,Register thread_tmp)3953 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3954 Register dst, Address src,
3955 Register tmp1, Register thread_tmp) {
3956 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3957 decorators = AccessInternal::decorator_fixup(decorators);
3958 bool as_raw = (decorators & AS_RAW) != 0;
3959 if (as_raw) {
3960 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3961 } else {
3962 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3963 }
3964 }
3965
access_store_at(BasicType type,DecoratorSet decorators,Address dst,Register src,Register tmp1,Register thread_tmp)3966 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3967 Address dst, Register src,
3968 Register tmp1, Register thread_tmp) {
3969 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3970 decorators = AccessInternal::decorator_fixup(decorators);
3971 bool as_raw = (decorators & AS_RAW) != 0;
3972 if (as_raw) {
3973 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3974 } else {
3975 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3976 }
3977 }
3978
// Load an oop from heap address src into dst through the GC barrier.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
3983
// Load an oop known to be non-NULL; the IS_NOT_NULL decorator lets the
// barrier implementation omit null handling.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}
3988
// Store the oop in src to heap address dst through the GC barrier.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
3993
3994 // Used for storing NULLs.
// Used for storing NULLs: passing noreg as the source register makes
// the barrier emit a NULL store.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}
3998
// Create a literal Address for the metadata object obj, allocating a
// fresh metadata index and attaching the matching relocation.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}
4005
// Move an oop into a register. immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread. In
// that case we can use move immediates rather than the constant pool.
// Move the oop constant obj into dst, recording an oop relocation.
// With immediate == true the oop is encoded directly in a
// move-immediate sequence; otherwise it is loaded from a
// constant-pool entry (see the comment preceding this function).
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}
4030
4031 // Move a metadata address into a register.
// Move the metadata address obj into dst, recording a metadata
// relocation for it.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}
4042
// Create a literal Address for the oop constant obj with an oop
// relocation attached; obj must resolve to a real heap oop.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}
4054
4055 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
tlab_allocate(Register obj,Register var_size_in_bytes,int con_size_in_bytes,Register t1,Register t2,Label & slow_case)4056 void MacroAssembler::tlab_allocate(Register obj,
4057 Register var_size_in_bytes,
4058 int con_size_in_bytes,
4059 Register t1,
4060 Register t2,
4061 Label& slow_case) {
4062 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4063 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4064 }
4065
4066 // Defines obj, preserves var_size_in_bytes
eden_allocate(Register obj,Register var_size_in_bytes,int con_size_in_bytes,Register t1,Label & slow_case)4067 void MacroAssembler::eden_allocate(Register obj,
4068 Register var_size_in_bytes,
4069 int con_size_in_bytes,
4070 Register t1,
4071 Label& slow_case) {
4072 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4073 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4074 }
4075
4076 // Zero words; len is in bytes
4077 // Destroys all registers except addr
4078 // len must be a nonzero multiple of wordSize
// Zero 'len' bytes (a nonzero multiple of wordSize) starting at addr,
// using an unrolled store loop entered Duff's-device style so the
// first pass handles the remainder. Destroys all registers except addr.
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = 0;
//        case 7:
//          p[-7] = 0;
//        case 6:
//          p[-6] = 0;
//          // ...
//        case 1:
//          p[-1] = 0;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);            // len is now a word count
  andr(rscratch1, len, unroll - 1);          // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);                  // cnt -= cnt % unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed goto: jump backwards from 'entry' into the unrolled store
  // sequence so the first pass performs exactly (cnt % unroll) stores.
  // Each A64 instruction is 4 bytes, hence the shift by 2.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}
4137
// Debug-only sanity check of the current thread's TLAB: verifies
// start <= top <= end. Saves and restores rscratch1/rscratch2 on the
// stack. No code is emitted unless UseTLAB && VerifyOops under ASSERT.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}
4165
4166 // Writes to stack successive pages until offset reached to check for
4167 // stack overflow + shadow pages. This clobbers tmp.
// Touch ("bang") the stack one page at a time from sp down through
// 'size' bytes plus the shadow zone, so any stack overflow faults
// eagerly at a known point. Clobbers tmp and size; uses rscratch1.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));   // the store is the "bang"
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
4195
4196
4197 // Move the address of the polling page into dest.
get_polling_page(Register dest,address page,relocInfo::relocType rtype)4198 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4199 if (SafepointMechanism::uses_thread_local_poll()) {
4200 ldr(dest, Address(rthread, Thread::polling_page_offset()));
4201 } else {
4202 unsigned long off;
4203 adrp(dest, Address(page, rtype), off);
4204 assert(off == 0, "polling page must be page aligned");
4205 }
4206 }
4207
4208 // Move the address of the polling page into r, then read the polling
4209 // page.
// Move the address of the polling page into r, then emit the poll
// (a read of the page). Returns the address of the poll instruction.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}
4214
4215 // Read the polling page. The address of the polling page must
4216 // already be in r.
// Emit the safepoint poll: a read of the polling page whose address is
// already in r. Returns the address of the emitted poll instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  // Record the relocation so the poll site can be identified later.
  code_section()->relocate(inst_mark(), rtype);
  // Load into zr: the value is discarded, only the memory access matters.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}
4223
// adrp with a code-cache-wide reach guarantee: materialize the page
// address of dest into reg1 and return the low 12 bits of the target
// in byte_offset. Falls back to an adrp+movk pair when a plain adrp
// might go out of range after relocation (see 8143067).
void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
  uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
  uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
  uint64_t dest_page = (uint64_t)dest.target() >> 12;
  int64_t offset_low = dest_page - low_page;
  int64_t offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    // Target is within +/- 2^20 pages of BOTH code-cache bounds, i.e.
    // inside adrp's +/- 4GB span wherever this code ends up.
    _adrp(reg1, dest.target());
  } else {
    // Out of plain adrp range: emit an adrp whose target shares the
    // real target's low 32 bits (and the current pc's bits 32-47),
    // then overwrite bits 32-47 with the real value via movk.
    uint64_t target = (uint64_t)dest.target();
    uint64_t adrp_target
      = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (uint64_t)dest.target() & 0xfff;
}
4250
load_byte_map_base(Register reg)4251 void MacroAssembler::load_byte_map_base(Register reg) {
4252 jbyte *byte_map_base =
4253 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4254
4255 // Strictly speaking the byte_map_base isn't an address at all, and it might
4256 // even be negative. It is thus materialised as a constant.
4257 mov(reg, (uint64_t)byte_map_base);
4258 }
4259
// Allocate a stack frame of framesize bytes and save rfp/lr at the top
// of the new frame.  Three strategies depending on frame size, chosen to
// fit the immediate ranges of the emitted instructions.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: drop sp once, then store the fp/lr pair at the frame
    // top using stp's scaled signed-offset form.
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    // Larger frame: push fp/lr first with pre-indexed stp, then extend
    // the frame.
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      // Remaining adjustment fits sub's 12-bit immediate.
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Too big for an immediate: go through a scratch register.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}
4277
// Tear down a frame built by build_frame(framesize): restore rfp/lr and
// pop the frame.  The size thresholds mirror build_frame exactly.
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: reload the fp/lr pair from the frame top, then pop the
    // whole frame in one add.
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    // Larger frame: shrink down to the saved pair first ...
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    // ... then pop fp/lr with post-indexed ldp.
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}
4293
4294 #ifdef COMPILER2
4295 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4296
// Search for str1 in str2 and return index or -1
// str2/cnt2 is the source string, str1/cnt1 the pattern.  icnt1 is the
// pattern length when known at compile time (-1 otherwise) and ae encodes
// the Latin1/UTF-16 combination of the two strings.  The result (index in
// characters, or -1) is left in 'result'.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  // Per-encoding load instructions: byte loads for Latin1, halfword loads
  // for UTF-16; load_2chr/load_4chr load 2 or 4 characters at once.
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // Runtime-length pattern: pick an algorithm based on the sizes.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, 8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    cmp(cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore alogorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes and algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    // Build the 256-entry bad-character table on the stack, initialized to
    // the pattern length (v0 was dup'ed with cnt1 above).
    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    // Fill in skip distances for the pattern's first cnt1-1 characters.
    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // UTF-16 pattern characters >= 256 have no table slot.
        cmp(ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
      BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    // Pre-load the pattern tail into tmp6 so the inner loop compares
    // registers instead of re-loading memory.
    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      // Outer loop: compare the pattern's last character against the
      // aligned source character, then walk backwards on match.
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      // Inner loop: character-by-character backwards comparison.
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      // Mismatch: advance str2 by the bad-character skip distance.
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        cmp(skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);  // pop the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // tmp5 still holds the original str2; difference is the byte offset.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);  // pop the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        // Generic linear scan: find the first character, then verify the
        // rest.  cnt1_neg/cnt2_neg walk up from negative offsets to zero.
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, 2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Specialized path: 4-character pattern compared as one load.
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // 2-character pattern compared as one load.
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // 3-character pattern: first two characters as one load, third
      // character checked separately.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Single-character pattern: SWAR search, 8 bytes of source at a time.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, 8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern character into every byte/halfword of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // Classic has-zero-element trick: after xor-ing with the replicated
        // pattern, (x - 0x01..) & ~(x | 0x7f..) is nonzero iff some element
        // of x is zero, i.e. iff the character occurs in this word.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Handle the (possibly overlapping) final word.
        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first matching element within the word.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a character index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
4728
4729 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4730 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4731
// Find the first occurrence of the UTF-16 character ch in str1/cnt1 and
// leave its index (or -1) in result.  Uses a SWAR scan over 4 characters
// (8 bytes) at a time for longer strings, a simple loop otherwise.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four halfwords of the register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Scan with a negative offset that counts up to zero; the last word is
  // handled by re-scanning from an adjusted offset below.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // Has-zero-halfword trick: after xor with the replicated character,
    // (x - 0x0001..) & ~(x | 0x7fff..) is nonzero iff some halfword of x
    // is zero, i.e. iff ch occurs in this word.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Final (possibly overlapping) word.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first matching halfword within the word.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 characters: plain per-character loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a character index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
4794
// Compare strings.
// Lexicographically compares str1/cnt1 with str2/cnt2 and leaves a
// negative/zero/positive value in result.  ae encodes the Latin1/UTF-16
// combination; long strings are handed off to precompiled stubs.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical string objects compare equal up to min length; the
      // length difference is already in result.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point str1/str2 past the data and walk with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 side is widened to UTF-16 via zip1 with a zero register.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit position down to a character boundary.
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    // Long strings: call the precompiled comparison stub for this
    // encoding combination.
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined loop: tmp2/rscratch1 are the "next" pair while
  // tmp1/cnt1 (the previous pair) are compared.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
5020 #endif // COMPILER2
5021
5022 // This method checks if provided byte array contains byte with highest bit set.
has_negatives(Register ary1,Register len,Register result)5023 address MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5024 // Simple and most common case of aligned small array which is not at the
5025 // end of memory page is placed here. All other cases are in stub.
5026 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5027 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5028 assert_different_registers(ary1, len, result);
5029
5030 cmpw(len, 0);
5031 br(LE, SET_RESULT);
5032 cmpw(len, 4 * wordSize);
5033 br(GE, STUB_LONG); // size > 32 then go to stub
5034
5035 int shift = 64 - exact_log2(os::vm_page_size());
5036 lsl(rscratch1, ary1, shift);
5037 mov(rscratch2, (u_int64_t)(4 * wordSize) << shift);
5038 adds(rscratch2, rscratch1, rscratch2); // At end of page?
5039 br(CS, STUB); // at the end of page then go to stub
5040 subs(len, len, wordSize);
5041 br(LT, END);
5042
5043 BIND(LOOP);
5044 ldr(rscratch1, Address(post(ary1, wordSize)));
5045 tst(rscratch1, UPPER_BIT_MASK);
5046 br(NE, SET_RESULT);
5047 subs(len, len, wordSize);
5048 br(GE, LOOP);
5049 cmpw(len, -wordSize);
5050 br(EQ, SET_RESULT);
5051
5052 BIND(END);
5053 ldr(result, Address(ary1));
5054 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5055 lslv(result, result, len);
5056 tst(result, UPPER_BIT_MASK);
5057 b(SET_RESULT);
5058
5059 BIND(STUB);
5060 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
5061 assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5062 address tpc1 = trampoline_call(has_neg);
5063 if (tpc1 == NULL) {
5064 DEBUG_ONLY(reset_labels3(STUB_LONG, SET_RESULT, DONE));
5065 postcond(pc() == badAddress);
5066 return NULL;
5067 }
5068 b(DONE);
5069
5070 BIND(STUB_LONG);
5071 RuntimeAddress has_neg_long = RuntimeAddress(StubRoutines::aarch64::has_negatives_long());
5072 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5073 address tpc2 = trampoline_call(has_neg_long);
5074 if (tpc2 == NULL) {
5075 DEBUG_ONLY(reset_labels2(SET_RESULT, DONE));
5076 postcond(pc() == badAddress);
5077 return NULL;
5078 }
5079 b(DONE);
5080
5081 BIND(SET_RESULT);
5082 cset(result, NE); // set true or false
5083
5084 BIND(DONE);
5085 postcond(pc() != badAddress);
5086 return pc();
5087 }
5088
arrays_equals(Register a1,Register a2,Register tmp3,Register tmp4,Register tmp5,Register result,Register cnt1,int elem_size)5089 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5090 Register tmp4, Register tmp5, Register result,
5091 Register cnt1, int elem_size) {
5092 Label DONE, SAME;
5093 Register tmp1 = rscratch1;
5094 Register tmp2 = rscratch2;
5095 Register cnt2 = tmp2; // cnt2 only used in array length compare
5096 int elem_per_word = wordSize/elem_size;
5097 int log_elem_size = exact_log2(elem_size);
5098 int length_offset = arrayOopDesc::length_offset_in_bytes();
5099 int base_offset
5100 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5101 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5102
5103 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5104 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5105
5106 #ifndef PRODUCT
5107 {
5108 const char kind = (elem_size == 2) ? 'U' : 'L';
5109 char comment[64];
5110 snprintf(comment, sizeof comment, "array_equals%c{", kind);
5111 BLOCK_COMMENT(comment);
5112 }
5113 #endif
5114
5115 // if (a1 == a2)
5116 // return true;
5117 cmpoop(a1, a2); // May have read barriers for a1 and a2.
5118 br(EQ, SAME);
5119
5120 if (UseSimpleArrayEquals) {
5121 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5122 // if (a1 == null || a2 == null)
5123 // return false;
5124 // a1 & a2 == 0 means (some-pointer is null) or
5125 // (very-rare-or-even-probably-impossible-pointer-values)
5126 // so, we can save one branch in most cases
5127 tst(a1, a2);
5128 mov(result, false);
5129 br(EQ, A_MIGHT_BE_NULL);
5130 // if (a1.length != a2.length)
5131 // return false;
5132 bind(A_IS_NOT_NULL);
5133 ldrw(cnt1, Address(a1, length_offset));
5134 ldrw(cnt2, Address(a2, length_offset));
5135 eorw(tmp5, cnt1, cnt2);
5136 cbnzw(tmp5, DONE);
5137 lea(a1, Address(a1, base_offset));
5138 lea(a2, Address(a2, base_offset));
5139 // Check for short strings, i.e. smaller than wordSize.
5140 subs(cnt1, cnt1, elem_per_word);
5141 br(Assembler::LT, SHORT);
5142 // Main 8 byte comparison loop.
5143 bind(NEXT_WORD); {
5144 ldr(tmp1, Address(post(a1, wordSize)));
5145 ldr(tmp2, Address(post(a2, wordSize)));
5146 subs(cnt1, cnt1, elem_per_word);
5147 eor(tmp5, tmp1, tmp2);
5148 cbnz(tmp5, DONE);
5149 } br(GT, NEXT_WORD);
5150 // Last longword. In the case where length == 4 we compare the
5151 // same longword twice, but that's still faster than another
5152 // conditional branch.
5153 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5154 // length == 4.
5155 if (log_elem_size > 0)
5156 lsl(cnt1, cnt1, log_elem_size);
5157 ldr(tmp3, Address(a1, cnt1));
5158 ldr(tmp4, Address(a2, cnt1));
5159 eor(tmp5, tmp3, tmp4);
5160 cbnz(tmp5, DONE);
5161 b(SAME);
5162 bind(A_MIGHT_BE_NULL);
5163 // in case both a1 and a2 are not-null, proceed with loads
5164 cbz(a1, DONE);
5165 cbz(a2, DONE);
5166 b(A_IS_NOT_NULL);
5167 bind(SHORT);
5168
5169 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5170 {
5171 ldrw(tmp1, Address(post(a1, 4)));
5172 ldrw(tmp2, Address(post(a2, 4)));
5173 eorw(tmp5, tmp1, tmp2);
5174 cbnzw(tmp5, DONE);
5175 }
5176 bind(TAIL03);
5177 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5178 {
5179 ldrh(tmp3, Address(post(a1, 2)));
5180 ldrh(tmp4, Address(post(a2, 2)));
5181 eorw(tmp5, tmp3, tmp4);
5182 cbnzw(tmp5, DONE);
5183 }
5184 bind(TAIL01);
5185 if (elem_size == 1) { // Only needed when comparing byte arrays.
5186 tbz(cnt1, 0, SAME); // 0-1 bytes left.
5187 {
5188 ldrb(tmp1, a1);
5189 ldrb(tmp2, a2);
5190 eorw(tmp5, tmp1, tmp2);
5191 cbnzw(tmp5, DONE);
5192 }
5193 }
5194 } else {
5195 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
5196 CSET_EQ, LAST_CHECK;
5197 mov(result, false);
5198 cbz(a1, DONE);
5199 ldrw(cnt1, Address(a1, length_offset));
5200 cbz(a2, DONE);
5201 ldrw(cnt2, Address(a2, length_offset));
5202 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
5203 // faster to perform another branch before comparing a1 and a2
5204 cmp(cnt1, elem_per_word);
5205 br(LE, SHORT); // short or same
5206 ldr(tmp3, Address(pre(a1, base_offset)));
5207 cmp(cnt1, stubBytesThreshold);
5208 br(GE, STUB);
5209 ldr(tmp4, Address(pre(a2, base_offset)));
5210 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5211 cmp(cnt2, cnt1);
5212 br(NE, DONE);
5213
5214 // Main 16 byte comparison loop with 2 exits
5215 bind(NEXT_DWORD); {
5216 ldr(tmp1, Address(pre(a1, wordSize)));
5217 ldr(tmp2, Address(pre(a2, wordSize)));
5218 subs(cnt1, cnt1, 2 * elem_per_word);
5219 br(LE, TAIL);
5220 eor(tmp4, tmp3, tmp4);
5221 cbnz(tmp4, DONE);
5222 ldr(tmp3, Address(pre(a1, wordSize)));
5223 ldr(tmp4, Address(pre(a2, wordSize)));
5224 cmp(cnt1, elem_per_word);
5225 br(LE, TAIL2);
5226 cmp(tmp1, tmp2);
5227 } br(EQ, NEXT_DWORD);
5228 b(DONE);
5229
5230 bind(TAIL);
5231 eor(tmp4, tmp3, tmp4);
5232 eor(tmp2, tmp1, tmp2);
5233 lslv(tmp2, tmp2, tmp5);
5234 orr(tmp5, tmp4, tmp2);
5235 cmp(tmp5, zr);
5236 b(CSET_EQ);
5237
5238 bind(TAIL2);
5239 eor(tmp2, tmp1, tmp2);
5240 cbnz(tmp2, DONE);
5241 b(LAST_CHECK);
5242
5243 bind(STUB);
5244 ldr(tmp4, Address(pre(a2, base_offset)));
5245 cmp(cnt2, cnt1);
5246 br(NE, DONE);
5247 if (elem_size == 2) { // convert to byte counter
5248 lsl(cnt1, cnt1, 1);
5249 }
5250 eor(tmp5, tmp3, tmp4);
5251 cbnz(tmp5, DONE);
5252 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5253 assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5254 address tpc = trampoline_call(stub);
5255 if (tpc == NULL) {
5256 DEBUG_ONLY(reset_labels5(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
5257 postcond(pc() == badAddress);
5258 return NULL;
5259 }
5260 b(DONE);
5261
5262 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5263 // so, if a2 == null => return false(0), else return true, so we can return a2
5264 mov(result, a2);
5265 b(DONE);
5266 bind(SHORT);
5267 cmp(cnt2, cnt1);
5268 br(NE, DONE);
5269 cbz(cnt1, SAME);
5270 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5271 ldr(tmp3, Address(a1, base_offset));
5272 ldr(tmp4, Address(a2, base_offset));
5273 bind(LAST_CHECK);
5274 eor(tmp4, tmp3, tmp4);
5275 lslv(tmp5, tmp4, tmp5);
5276 cmp(tmp5, zr);
5277 bind(CSET_EQ);
5278 cset(result, EQ);
5279 b(DONE);
5280 }
5281
5282 bind(SAME);
5283 mov(result, true);
5284 // That's it.
5285 bind(DONE);
5286
5287 BLOCK_COMMENT("} array_equals");
5288 postcond(pc() != badAddress);
5289 return pc();
5290 }
5291
5292 // Compare Strings
5293
5294 // For Strings we're passed the address of the first characters in a1
5295 // and a2 and the length in cnt1.
5296 // elem_size is the element size in bytes: either 1 or 2.
5297 // There are two implementations. For arrays >= 8 bytes, all
5298 // comparisons (including the final one, which may overlap) are
5299 // performed 8 bytes at a time. For strings < 8 bytes, we compare a
5300 // halfword, then a short, and then a byte.
5301
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  // Emit code comparing two string bodies for equality.
  //   a1, a2:    addresses of the first character of each string; clobbered.
  //   cnt1:      length in BYTES (the subtractions below are by wordSize);
  //              clobbered.
  //   result:    receives 1 if equal, 0 otherwise.
  //   elem_size: element width in bytes (1 = Latin-1, 2 = UTF-16); only
  //              affects whether a trailing odd byte needs checking.
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2; // NOTE: cnt2 is never used in this routine; the
                        // alias is leftover from the array_equals variant.

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  // (a1/a2 have run past the end; the negative cnt1 offset backs the
  // final, possibly overlapping, 8-byte load up to the true tail.)
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // Binary decomposition of the remaining length: test bit 2 (4 bytes),
  // then bit 1 (2 bytes), then bit 0 (1 byte, Latin-1 only).
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}
5382
5383
// The size (in HeapWords) of the blocks erased by the zero_blocks stub.
// zero_words() must handle any residual count smaller than this itself.
const int MacroAssembler::zero_words_block_size = 8;
5387
5388 // zero_words() is used by C2 ClearArray patterns. It is as small as
5389 // possible, handling small word counts locally and delegating
5390 // anything larger to the zero_blocks stub. It is expanded many times
5391 // in compiled code, so it is important to keep it short.
5392
5393 // ptr: Address of a buffer to be zeroed.
5394 // cnt: Count in HeapWords.
5395 //
5396 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
address MacroAssembler::zero_words(Register ptr, Register cnt)
{
  // Emits code to zero `cnt` HeapWords starting at `ptr`.  Counts of at
  // least zero_words_block_size words are delegated to the zero_blocks
  // stub; the remainder (< block size) is handled inline by a bit-ladder
  // over the low bits of cnt.  Returns pc() on success, or NULL if a
  // trampoline to the stub could not be emitted.
  assert(is_power_of_2(zero_words_block_size), "adjust this");
  // Fixed registers because the zero_blocks stub expects its arguments here.
  assert(ptr == r10 && cnt == r11, "mismatch in register usage");

  BLOCK_COMMENT("zero_words {");
  cmp(cnt, zero_words_block_size);
  Label around, done, done16;  // NOTE: done/done16 are declared but unused here.
  br(LO, around);
  {
    RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
    if (StubRoutines::aarch64::complete()) {
      address tpc = trampoline_call(zero_blocks);
      if (tpc == NULL) {
        // Code cache is full: unbind labels so debug builds don't assert.
        DEBUG_ONLY(reset_labels1(around));
        postcond(pc() == badAddress);
        return NULL;
      }
    } else {
      // Stubs not yet complete (we may be generating them now): direct call.
      bl(zero_blocks);
    }
  }
  bind(around);
  // Zero the remaining words by testing each low bit of cnt: 4 words,
  // then 2 words (via 16-byte stp pairs), then a final single word.
  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
    Label l;
    tbz(cnt, exact_log2(i), l);
    for (int j = 0; j < i; j += 2) {
      stp(zr, zr, post(ptr, 16));
    }
    bind(l);
  }
  {
    Label l;
    tbz(cnt, 0, l);
    str(zr, Address(ptr));
    bind(l);
  }
  BLOCK_COMMENT("} zero_words");
  postcond(pc() != badAddress);
  return pc();
}
5439
5440 // base: Address of a buffer to be zeroed, 8 bytes aligned.
5441 // cnt: Immediate count in HeapWords.
5442 #define SmallArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, uint64_t cnt)
{
  // Emits code to zero a compile-time-constant count of HeapWords at `base`
  // (which must be 8-byte aligned).  Small counts are fully unrolled;
  // larger ones use a 4x-unrolled stp loop driven by rscratch1/rscratch2.
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1; // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small case: emit straight-line stp pairs, no loop.
    for (; i < (int)cnt; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    int remainder = cnt % (2 * unroll);
    // Peel off the words that don't fill a whole unrolled iteration.
    for (; i < remainder; i += 2) {
      stp(zr, zr, Address(base, i * wordSize));
    }
    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;  // now a multiple of 2 * unroll
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    // unroll-1 plain stores, then one pre-indexed store that advances
    // loop_base by a full iteration's worth of bytes.
    for (i = 1; i < unroll; i++) {
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    }
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}
5476
5477 // Zero blocks of memory by using DC ZVA.
5478 //
5479 // Aligns the base address first sufficently for DC ZVA, then uses
5480 // DC ZVA repeatedly for every full block. cnt is the size to be
5481 // zeroed in HeapWords. Returns the count of words left to be zeroed
5482 // in cnt.
5483 //
5484 // NOTE: This is intended to be used in the zero_blocks() stub. If
5485 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);  // cnt is in words; tmp>>3 words consumed
  // Computed branch into the stp table below: each stp zeroes 16 bytes and
  // occupies 4 bytes of code, so we skip (tmp/16)*4 == tmp>>2 bytes of
  // instructions, executing exactly enough stores to reach alignment.
  // (The stores use negative offsets from the already-advanced base.)
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Main loop: one DC ZVA per full ZVA-sized block.
  // cnt is biased down by one block so the GE test exits with at least
  // zero full blocks remaining unprocessed.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}
5520
5521 // base: Address of a buffer to be filled, 8 bytes aligned.
5522 // cnt: Count in 8-byte unit.
5523 // value: Value to be filled with.
5524 // base will point to the end of the buffer after filling.
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
  // Algorithm:
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = v;
  //        case 7:
  //          p[-7] = v;
  //        case 6:
  //          p[-6] = v;
  //          // ...
  //        case 1:
  //          p[-1] = v;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte aligned, store one word to reach 16-byte
  // alignment for the stp stores below.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of leftover words (in pairs, i.e. cnt & 14).
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  // Computed branch into the unrolled stp table (Duff's device): each stp
  // writes 2 words and is 4 bytes of code, so skip rscratch1*2 bytes.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Store any final odd word.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}
5578
5579 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5580 // java/lang/StringUTF16.compress.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    // Emit code narrowing UTF-16 chars at src to ISO-8859-1 bytes at dst,
    // stopping at the first char with any of bits 8..15 set.
    //   src:    address of the first source char; clobbered.
    //   dst:    address of the destination byte buffer; clobbered.
    //   len:    char count; on exit, number of chars NOT processed
    //           (0 if all chars were Latin-1).
    //   result: on exit, number of chars successfully encoded.
    // Also clobbers v4/v5 in addition to the four SIMD temporaries.
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Prefetching variant: run the PRFM loop while at least
        // SoftwarePrefetchHintDistance/2 + 16 chars remain.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          // Low bytes of all 32 chars into Vtmp1/Vtmp3; OR of high bytes
          // into v5 so one scalar test detects any non-Latin-1 char.
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          cbnz(tmp1, LOOP_8);  // a high byte is set: fall back to narrower loops
          stpq(Vtmp1, Vtmp3, dst);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      stpq(v4, v5, dst);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      // 8-chars-at-a-time SIMD loop.
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);  // a high byte is set: locate it one char at a time
      strd(Vtmp2, dst);

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
      // Scalar loop: also pinpoints the exact offending char after a SIMD
      // chunk detected a high byte.
      cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}
5683
5684
5685 // Inflate byte[] array to char[].
address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           FloatRegister vtmp3, Register tmp4) {
  // Emits code that widens `len` Latin-1 bytes at src into chars at dst.
  //   src, dst, len, tmp4: clobbered.
  // Returns pc() on success, or NULL if the large-array stub trampoline
  // could not be emitted (code cache full).
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);     // vtmp1 = zeros, interleaved with bytes by zip1 below
  lsrw(tmp4, len, 3);   // tmp4 = number of full 8-byte chunks
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      // Delegate large arrays to the shared stub, then resume as if the
      // remaining work were a fresh (smaller) inflate.
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      address tpc = trampoline_call(stub);
      if (tpc == NULL) {
        DEBUG_ONLY(reset_labels2(big, done));
        postcond(pc() == badAddress);
        return NULL;
      }
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);     // len = tail byte count (< 8)
      cmp(tmp4, large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Software-pipelined loop: two 8-byte chunks per iteration, with
      // the load of the first chunk hoisted out of the loop body.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);     // len = tail byte count (< 8)
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.  Re-reads the last 8 source bytes and
  // rewrites the last 16 destination bytes, possibly overlapping stores
  // already done above.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
  postcond(pc() != badAddress);
  return pc();
}
5789
5790 // Compress char[] array to byte[].
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  // encode_iso_array sets result to the number of chars encoded and leaves
  // len == 0 iff every char fit in one byte.
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  // StringUTF16.compress contract: return the copied length on full
  // success, 0 if any char was not Latin-1.
  cmp(len, zr);
  csel(result, result, zr, EQ);
}
5800
5801 #ifdef __OpenBSD__
5802 // OpenBSD uses emulated tls so it can't use aarch64_get_thread_helper().
5803 // Save whatever non-callee save context might get clobbered by
5804 // Thread::current.
get_thread(Register dst)5805 void MacroAssembler::get_thread(Register dst) {
5806 RegSet saved_regs = RegSet::range(r0, r18) + lr - dst;
5807 push(saved_regs, sp);
5808
5809 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
5810 if (dst != c_rarg0) {
5811 mov(dst, c_rarg0);
5812 }
5813
5814 pop(saved_regs, sp);
5815 }
5816 #else
5817 // get_thread() can be called anywhere inside generated code so we
5818 // need to save whatever non-callee save context might get clobbered
5819 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5820 // the call setup code.
5821 //
5822 // On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5823 // On other systems, the helper is a usual C function.
5824 //
get_thread(Register dst)5825 void MacroAssembler::get_thread(Register dst) {
5826 RegSet saved_regs =
5827 LINUX_ONLY(RegSet::range(r0, r1) + lr - dst)
5828 NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
5829
5830 push(saved_regs, sp);
5831
5832 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5833 blr(lr);
5834 if (dst != c_rarg0) {
5835 mov(dst, c_rarg0);
5836 }
5837
5838 pop(saved_regs, sp);
5839 }
5840 #endif
5841