1 /*
2 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "jvm.h"
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "compiler/disassembler.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "gc/shared/collectedHeap.inline.hpp"
33 #include "interpreter/interpreter.hpp"
34 #include "memory/resourceArea.hpp"
35 #include "memory/universe.hpp"
36 #include "oops/accessDecorators.hpp"
37 #include "oops/compressedOops.inline.hpp"
38 #include "oops/klass.inline.hpp"
39 #include "prims/methodHandles.hpp"
40 #include "runtime/biasedLocking.hpp"
41 #include "runtime/flags/flagSetting.hpp"
42 #include "runtime/interfaceSupport.inline.hpp"
43 #include "runtime/objectMonitor.hpp"
44 #include "runtime/os.hpp"
45 #include "runtime/safepoint.hpp"
46 #include "runtime/safepointMechanism.hpp"
47 #include "runtime/sharedRuntime.hpp"
48 #include "runtime/stubRoutines.hpp"
49 #include "runtime/thread.hpp"
50 #include "utilities/macros.hpp"
51 #include "crc32c.h"
52
53 #ifdef PRODUCT
54 #define BLOCK_COMMENT(str) /* nothing */
55 #define STOP(error) stop(error)
56 #else
57 #define BLOCK_COMMENT(str) block_comment(str)
58 #define STOP(error) block_comment(error); stop(error)
59 #endif
60
61 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
62
63 #ifdef ASSERT
pd_check_instruction_mark()64 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
65 #endif
66
// Table of negated branch conditions, indexed by the condition's x86
// encoding (0x0 - 0xf): reverse[cc] is the condition that succeeds exactly
// when cc fails.  Used to invert the sense of a conditional jump.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow   /* overflow      = 0x0 */ ,
    Assembler::overflow     /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual   /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below        /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero      /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero         /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above        /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual   /* above         = 0x7 */ ,
    Assembler::positive     /* negative      = 0x8 */ ,
    Assembler::negative     /* positive      = 0x9 */ ,
    Assembler::noParity     /* parity        = 0xa */ ,
    Assembler::parity       /* noParity      = 0xb */ ,
    Assembler::greaterEqual /* less          = 0xc */ ,
    Assembler::less         /* greaterEqual  = 0xd */ ,
    Assembler::greater      /* lessEqual     = 0xe */ ,
    Assembler::lessEqual    /* greater       = 0xf, */

};
86
87
88 // Implementation of MacroAssembler
89
90 // First all the versions that have distinct versions depending on 32/64 bit
91 // Unless the difference is trivial (1 line or so).
92
93 #ifndef _LP64
94
95 // 32bit versions
96
// 32-bit: an AddressLiteral is just an absolute address plus its
// relocation info; wrap it directly in an Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// 32-bit: array addressing can embed the absolute base directly,
// so the whole ArrayAddress collapses into a single Address.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
104
// 32-bit leaf call into the VM: arguments were pushed on the stack by the
// caller (cdecl), so after the call we pop them ourselves by bumping rsp.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}
110
// Compare memory against a Metadata* embedded as a 32-bit immediate,
// recorded with a metadata relocation so the pointer can be patched.
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Same as above, but the first operand is a register.
void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Raw oop compare: emits the relocated immediate compare directly,
// bypassing any GC barrier logic.
void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Barrier-aware oop compare: delegates to the active GC's
// BarrierSetAssembler so it can emit whatever resolution code it needs.
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}
136
// Sign-extend the 32-bit value in 'lo' into the register pair hi:lo.
// Uses the one-byte CDQ form when the pair is exactly rdx:rax (faster on
// P6 per the cited app note), otherwise a mov + arithmetic shift.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);   // replicate the sign bit into all of hi
  }
}
146
// Jump to L if the x87 FPU status flag C2 is set.  fnstsw_ax copies the
// status word into ax and sahf transfers C2 into the CPU parity flag;
// rax is preserved via save/restore through 'tmp'.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

// Jump to L if FPU status flag C2 is clear (inverse of jC2 above).
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
166
167 // 32bit can do a case table jump in one instruction but we no longer allow the base
168 // to be installed in the Address class
// 32-bit indirect jump through a dispatch-table entry; the absolute base
// can be folded into the Address, so this is a single jmp.
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
172
173 // Note: y_lo will be destroyed
// Compare two 64-bit values held in register pairs and produce -1/0/+1 in
// x_hi, per Java lcmp semantics: signed compare of the high words first,
// then an unsigned compare of the low words on a tie.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);          // high words equal: tentative result 0
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);  // low words compare unsigned
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);           // result +1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);          // result -1

  bind(done);
}
198
// 32-bit: materialize the literal's absolute address into 'dst' as a
// relocated 32-bit immediate move.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

// Store the literal's address into memory at 'dst'.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
208
// Tear down the current frame: restore the caller's stack pointer and
// frame pointer (equivalent to the LEAVE instruction).
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
213
// 64x64->64 multiply of two Java longs read from the stack; result in
// rdx:rax.  Clobbers rbx and rcx.  Skips the two cross products when both
// high words are zero (the common case for small values).
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
253
// Two's-complement negate of the 64-bit value in the register pair hi:lo:
// neg the low word, propagate the borrow into hi via adc, then neg hi.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}
259
// 64-bit left shift of the pair hi:lo by the count in rcx (Java lshl).
// Counts >= 32 are handled by first moving lo into hi and zeroing lo.
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
279
280
// 64-bit right shift of the pair hi:lo by the count in rcx, arithmetic
// (Java lshr) when sign_extension is true, logical (lushr) otherwise.
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
301
// Load an oop constant into a register as a relocated 32-bit immediate.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Store an oop constant directly to memory (32-bit can do mem-imm moves).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Load a Metadata* constant into a register, with metadata relocation.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* constant directly to memory.
void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}
317
// Load either the literal's address (lval) or the value it points to
// (rval) into 'dst'.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

// Store a register into an array slot.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

// Load an array slot into a register.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
340
341
// Restore the registers saved by push_callee_saved_registers(), popping
// in the reverse order of the pushes below.
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

// Save rsi/rdi/rdx/rcx across a call.  NOTE(review): rdx/rcx are
// caller-saved in the standard 32-bit C ABI; the name reflects how the
// VM's callers use these registers -- confirm against call sites.
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
355
// Push an oop constant as a relocated 32-bit immediate.
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Push a Metadata* constant as a relocated 32-bit immediate.
void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Push either the literal's address (lval) or the word it points to (rval).
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
371
// Materialize 1 in 'dst' if the zero flag is clear, 0 otherwise.
// The xor must come first: setcc only writes the low byte, and xor
// does not disturb ZF ordering here because... note: xorl DOES clear ZF;
// this relies on set_byte_if_not_zero being emitted after the xor, which
// itself does not re-test -- the flags consumed are those set BEFORE this
// call?  NOTE(review): xorl alters flags; confirm callers depend only on
// the byte-set semantics of set_byte_if_not_zero's own implementation.
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
376
// 32-bit argument passing for call_VM helpers: all arguments go on the
// stack, so each pass_argN is simply a push.  Callers must invoke these
// in reverse argument order (arg3 first) to get cdecl stack layout.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
392
393 #ifndef PRODUCT
394 extern "C" void findpc(intptr_t x);
395 #endif
396
debug32(int rdi,int rsi,int rbp,int rsp,int rbx,int rdx,int rcx,int rax,int eip,char * msg)397 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
398 // In order to get locks to work, we need to fake a in_VM state
399 JavaThread* thread = JavaThread::current();
400 JavaThreadState saved_state = thread->thread_state();
401 thread->set_thread_state(_thread_in_vm);
402 if (ShowMessageBoxOnError) {
403 JavaThread* thread = JavaThread::current();
404 JavaThreadState saved_state = thread->thread_state();
405 thread->set_thread_state(_thread_in_vm);
406 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
407 ttyLocker ttyl;
408 BytecodeCounter::print();
409 }
410 // To see where a verify_oop failed, get $ebx+40/X for this frame.
411 // This is the value of eip which points to where verify_oop will return.
412 if (os::message_box(msg, "Execution stopped, print registers?")) {
413 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
414 BREAKPOINT;
415 }
416 }
417 fatal("DEBUG MESSAGE: %s", msg);
418 }
419
// Dump the saved 32-bit register state, nearby stack words, and a
// disassembly window around eip.  Called from debug32() with the values
// that stop() captured via pusha().
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);    // suppress safepoint checks while dumping
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);                      // resolve eip to a code-blob description
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  // First 8 words individually, with symbolic location info ...
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) " INTPTR_FORMAT ": ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  // ... then a raw hex dump of the next 16 rows of 8 words.
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) " INTPTR_FORMAT ": ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}
460
// Emit code that halts execution with 'msg': pushes the message address,
// the current eip, and all registers, then calls debug32 (which aborts).
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();                                              // unreachable; trap if we get here
}
470
// Emit code that prints a warning message at runtime and continues;
// saves and restores the full CPU state around the C call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
482
// Emit code that dumps the current register/stack state at runtime
// without stopping: push eip + registers, call print_state32, restore.
void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);       // discard the pushed eip
}
494
495 #else // _LP64
496
497 // 64 bit versions
498
// 64-bit: literals must be encoded rip-relative, so the Address carries
// both the 32-bit displacement from the current pc and the real target
// (for later verification/relocation).  The literal must be reachable.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

// 64-bit: array addressing cannot embed a 64-bit base, so load the base
// into rscratch1 and build a register-relative Address.  CLOBBERS rscratch1.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
517
// 64-bit leaf call into the VM: arguments are already in registers, so
// this only has to 16-byte-align the stack (and, on Windows, allocate the
// mandatory register-argument shadow space) around the call.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);              // misaligned: push 8 bytes to reach 16-byte alignment
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);                   // already aligned: call directly
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
551
// Compare src1 with the 64-bit value AT the literal address.  If the
// literal is out of rip-relative range, go through rscratch1.
void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}
562
// Emit a Java-correct 64-bit divide: idivq traps on min_long / -1, so
// that pair is special-cased to yield quotient min_long, remainder 0.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();                              // sign-extend rax into rdx:rax
  int idivq_offset = offset();         // recorded for implicit div-by-zero exceptions
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
599
decrementq(Register reg,int value)600 void MacroAssembler::decrementq(Register reg, int value) {
601 if (value == min_jint) { subq(reg, value); return; }
602 if (value < 0) { incrementq(reg, -value); return; }
603 if (value == 0) { ; return; }
604 if (value == 1 && UseIncDec) { decq(reg) ; return; }
605 /* else */ { subq(reg, value) ; return; }
606 }
607
decrementq(Address dst,int value)608 void MacroAssembler::decrementq(Address dst, int value) {
609 if (value == min_jint) { subq(dst, value); return; }
610 if (value < 0) { incrementq(dst, -value); return; }
611 if (value == 0) { ; return; }
612 if (value == 1 && UseIncDec) { decq(dst) ; return; }
613 /* else */ { subq(dst, value) ; return; }
614 }
615
// Increment the 64-bit word at a literal address, going through
// rscratch1 when the address is not rip-reachable.
void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}
624
incrementq(Register reg,int value)625 void MacroAssembler::incrementq(Register reg, int value) {
626 if (value == min_jint) { addq(reg, value); return; }
627 if (value < 0) { decrementq(reg, -value); return; }
628 if (value == 0) { ; return; }
629 if (value == 1 && UseIncDec) { incq(reg) ; return; }
630 /* else */ { addq(reg, value) ; return; }
631 }
632
incrementq(Address dst,int value)633 void MacroAssembler::incrementq(Address dst, int value) {
634 if (value == min_jint) { addq(dst, value); return; }
635 if (value < 0) { decrementq(dst, -value); return; }
636 if (value == 0) { ; return; }
637 if (value == 1 && UseIncDec) { incq(dst) ; return; }
638 /* else */ { addq(dst, value) ; return; }
639 }
640
641 // 32bit can do a case table jump in one instruction but we no longer allow the base
642 // to be installed in the Address class
// 64-bit dispatch-table jump: the table base cannot be an immediate, so
// load it into rscratch1 and jump through the indexed address.
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
650
// 64-bit stub: longs live in single registers, so the two-register long
// compare must never be reached here.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

// Materialize the literal's absolute 64-bit address into 'dst'.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

// Store the literal's address into memory; 64-bit immediates cannot be
// stored directly, so stage through rscratch1.  CLOBBERS rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

// Frame teardown via the one-byte LEAVE instruction.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

// 64-bit stub: two-register long negate is never used here.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
674
// Load an oop constant as a relocated 64-bit immediate.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

// Store an oop constant to memory via rscratch1 (no 64-bit mem-imm move).
// CLOBBERS rscratch1.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

// Load a Metadata* constant as a relocated 64-bit immediate.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

// Store a Metadata* constant to memory via rscratch1.  CLOBBERS rscratch1.
void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
692
// Load either the literal's address (lval) or the 64-bit value it points
// to (rval); uses 'scratch' only for the unreachable-rval case.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

// Store a register into an array slot (as_Address clobbers rscratch1).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

// Load an array slot into a register (as_Address clobbers rscratch1).
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);   // store sign-extended 32-bit immediate
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
728
// Push an oop constant; staged through rscratch1 since 64-bit immediates
// cannot be pushed directly.  CLOBBERS rscratch1.
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

// Push a Metadata* constant via rscratch1.  CLOBBERS rscratch1.
void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

// Push either the literal's address (lval) or the word at that address
// (rval).  CLOBBERS rscratch1.
void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}
747
// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc)
// so the stack walker no longer treats it as walkable.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  vzeroupper();
}
761
// Record the current Java frame (sp, optional fp, optional pc) in the
// thread's frame anchor before transitioning out of Java code.  The sp
// store is done LAST: a non-zero last_Java_sp is what marks the anchor
// as valid to the stack walker.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
787
pass_arg0(MacroAssembler * masm,Register arg)788 static void pass_arg0(MacroAssembler* masm, Register arg) {
789 if (c_rarg0 != arg ) {
790 masm->mov(c_rarg0, arg);
791 }
792 }
793
pass_arg1(MacroAssembler * masm,Register arg)794 static void pass_arg1(MacroAssembler* masm, Register arg) {
795 if (c_rarg1 != arg ) {
796 masm->mov(c_rarg1, arg);
797 }
798 }
799
pass_arg2(MacroAssembler * masm,Register arg)800 static void pass_arg2(MacroAssembler* masm, Register arg) {
801 if (c_rarg2 != arg ) {
802 masm->mov(c_rarg2, arg);
803 }
804 }
805
pass_arg3(MacroAssembler * masm,Register arg)806 static void pass_arg3(MacroAssembler* masm, Register arg) {
807 if (c_rarg3 != arg ) {
808 masm->mov(c_rarg3, arg);
809 }
810 }
811
// Emit code that halts execution with 'msg'.  When ShowMessageBoxOnError
// is set, the faulting rip and a pusha() register array are passed to
// debug64 so it can offer an interactive register dump.
void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  lea(c_rarg0, ExternalAddress((address) msg));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();          // unreachable; trap if we get here
}
824
// Emit code that prints a warning at runtime and continues.  Builds a
// small frame in rbp so rsp can be realigned and restored afterwards.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
  call(rax);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}
837
// Emit code that dumps the current register/stack state at runtime
// without stopping: capture rip and a pusha() register array, then call
// print_state64 and restore everything.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
855
856 #ifndef PRODUCT
857 extern "C" void findpc(intptr_t x);
858 #endif
859
// Runtime target of MacroAssembler::stop() on 64-bit: optionally shows a
// message box with a register dump, then aborts the VM with 'msg'.
// 'regs' points at the array produced by pusha() in stop().
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);  // never returns
}
882
print_state64(int64_t pc,int64_t regs[])883 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
884 ttyLocker ttyl;
885 FlagSetting fs(Debugging, true);
886 tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
887 #ifndef PRODUCT
888 tty->cr();
889 findpc(pc);
890 tty->cr();
891 #endif
892 #define PRINT_REG(rax, value) \
893 { tty->print("%s = ", #rax); os::print_location(tty, value); }
894 PRINT_REG(rax, regs[15]);
895 PRINT_REG(rbx, regs[12]);
896 PRINT_REG(rcx, regs[14]);
897 PRINT_REG(rdx, regs[13]);
898 PRINT_REG(rdi, regs[8]);
899 PRINT_REG(rsi, regs[9]);
900 PRINT_REG(rbp, regs[10]);
901 // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
902 PRINT_REG(rsp, (intptr_t)(®s[16]));
903 PRINT_REG(r8 , regs[7]);
904 PRINT_REG(r9 , regs[6]);
905 PRINT_REG(r10, regs[5]);
906 PRINT_REG(r11, regs[4]);
907 PRINT_REG(r12, regs[3]);
908 PRINT_REG(r13, regs[2]);
909 PRINT_REG(r14, regs[1]);
910 PRINT_REG(r15, regs[0]);
911 #undef PRINT_REG
912 // Print some words near the top of the stack.
913 int64_t* rsp = ®s[16];
914 int64_t* dump_sp = rsp;
915 for (int col1 = 0; col1 < 8; col1++) {
916 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
917 os::print_location(tty, *dump_sp++);
918 }
919 for (int row = 0; row < 25; row++) {
920 tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
921 for (int col = 0; col < 4; col++) {
922 tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
923 }
924 tty->cr();
925 }
926 // Print some instructions around pc:
927 Disassembler::decode((address)pc-64, (address)pc);
928 tty->print_cr("--------");
929 Disassembler::decode((address)pc, (address)pc+32);
930 }
931
932 #endif // _LP64
933
934 // Now versions that are common to 32/64 bit
935
addptr(Register dst,int32_t imm32)936 void MacroAssembler::addptr(Register dst, int32_t imm32) {
937 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
938 }
939
addptr(Register dst,Register src)940 void MacroAssembler::addptr(Register dst, Register src) {
941 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
942 }
943
addptr(Address dst,Register src)944 void MacroAssembler::addptr(Address dst, Register src) {
945 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
946 }
947
addsd(XMMRegister dst,AddressLiteral src)948 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
949 if (reachable(src)) {
950 Assembler::addsd(dst, as_Address(src));
951 } else {
952 lea(rscratch1, src);
953 Assembler::addsd(dst, Address(rscratch1, 0));
954 }
955 }
956
addss(XMMRegister dst,AddressLiteral src)957 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
958 if (reachable(src)) {
959 addss(dst, as_Address(src));
960 } else {
961 lea(rscratch1, src);
962 addss(dst, Address(rscratch1, 0));
963 }
964 }
965
addpd(XMMRegister dst,AddressLiteral src)966 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
967 if (reachable(src)) {
968 Assembler::addpd(dst, as_Address(src));
969 } else {
970 lea(rscratch1, src);
971 Assembler::addpd(dst, Address(rscratch1, 0));
972 }
973 }
974
// Align the current code-emission position to the given modulus by
// padding with nops; offset() is the current position in the code buffer.
void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}
978
align(int modulus,int target)979 void MacroAssembler::align(int modulus, int target) {
980 if (target % modulus != 0) {
981 nop(modulus - (target % modulus));
982 }
983 }
984
andpd(XMMRegister dst,AddressLiteral src,Register scratch_reg)985 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
986 // Used in sign-masking with aligned address.
987 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
988 if (reachable(src)) {
989 Assembler::andpd(dst, as_Address(src));
990 } else {
991 lea(scratch_reg, src);
992 Assembler::andpd(dst, Address(scratch_reg, 0));
993 }
994 }
995
andps(XMMRegister dst,AddressLiteral src,Register scratch_reg)996 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
997 // Used in sign-masking with aligned address.
998 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
999 if (reachable(src)) {
1000 Assembler::andps(dst, as_Address(src));
1001 } else {
1002 lea(scratch_reg, src);
1003 Assembler::andps(dst, Address(scratch_reg, 0));
1004 }
1005 }
1006
andptr(Register dst,int32_t imm32)1007 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1008 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1009 }
1010
// Atomically increment a 32-bit counter in memory (lock-prefixed inc).
void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}
1015
atomic_incl(AddressLiteral counter_addr,Register scr)1016 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1017 if (reachable(counter_addr)) {
1018 atomic_incl(as_Address(counter_addr));
1019 } else {
1020 lea(scr, counter_addr);
1021 atomic_incl(Address(scr, 0));
1022 }
1023 }
1024
1025 #ifdef _LP64
// Atomically increment a 64-bit counter in memory (lock-prefixed inc).
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}
1030
atomic_incq(AddressLiteral counter_addr,Register scr)1031 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1032 if (reachable(counter_addr)) {
1033 atomic_incq(as_Address(counter_addr));
1034 } else {
1035 lea(scr, counter_addr);
1036 atomic_incq(Address(scr, 0));
1037 }
1038 }
1039 #endif
1040
1041 // Writes to stack successive pages until offset reached to check for
1042 // stack overflow + shadow pages. This clobbers tmp.
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
// size holds the number of bytes to bang (clobbered: counts down to <= 0);
// tmp is used as the moving page pointer, starting at rsp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
1067
// Emit a check of rsp against the thread's reserved stack activation:
// if rsp has grown into the reserved zone, enable the zone via the VM
// and throw a delayed StackOverflowError; otherwise fall through.
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;
  Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
  NOT_LP64(get_thread(rsi);)

  cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
  jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
1083
// Emit the biased-locking fast path for monitorenter. On success the code
// jumps to 'done' with the lock acquired via the bias; on failure it either
// jumps to *slow_case (when the CAS loses) or falls through to 'cas_label'
// (bound at the end) so the caller can continue with the normal CAS-based
// locking scheme. Returns the code-buffer offset of the first instruction
// that loads through obj_reg, for use as an implicit null-check PC.
// swap_reg must be rax (cmpxchg requirement); tmp_reg/tmp_reg2 are scratch.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         Register tmp_reg2,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  NOT_LP64( Address saved_mark_addr(lock_reg, 0); )

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  // Mask off the age bits: they may legitimately differ without
  // invalidating the bias.
  andptr(header_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markWord::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markWord::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
1269
// Emit the biased-locking fast path for monitorexit: if the object's mark
// still carries the bias pattern, unlocking is a no-op and we jump to done.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(temp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
1284
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
1293
1294 // Wouldn't need if AddressLiteral version had new name
// Wouldn't need if AddressLiteral version had new name
// Call to a label within this code buffer, with the given reloc type.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
1298
// Indirect call through a register.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
1302
call(AddressLiteral entry)1303 void MacroAssembler::call(AddressLiteral entry) {
1304 if (reachable(entry)) {
1305 Assembler::call_literal(entry.target(), entry.rspec());
1306 } else {
1307 lea(rscratch1, entry);
1308 Assembler::call(rscratch1);
1309 }
1310 }
1311
// Emit an inline-cache call: load the IC sentinel (non_oop_word) into rax
// as the cached-class placeholder, then call entry with a virtual-call
// relocation carrying method_index.
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  movptr(rax, (intptr_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}
1317
1318 // Implementation of call_VM versions
1319
// Call into the VM runtime with no arguments. The call(C)/jmp(E) dance
// places a return address on the stack so call_VM_helper can derive
// last_Java_sp/last_Java_pc from it; the out-of-line stub at C performs
// the actual transition and returns past E.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
1333
// Call into the VM runtime with one argument; see the zero-argument
// call_VM above for the call(C)/jmp(E) last_Java_pc trick.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
1349
// Call into the VM runtime with two arguments; see the zero-argument
// call_VM above for the call(C)/jmp(E) last_Java_pc trick. Arguments are
// passed highest-numbered first so that placing a later arg cannot
// clobber a register still holding an earlier one (checked by the assert).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
1370
// Call into the VM runtime with three arguments; see the zero-argument
// call_VM above for the call(C)/jmp(E) last_Java_pc trick. Arguments are
// passed highest-numbered first so that placing a later arg cannot
// clobber a register still holding an earlier one (checked by the asserts).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
1396
// call_VM variant taking an explicit last_java_sp; forwards to
// call_VM_base with the platform thread register (r15 on 64-bit, looked
// up via get_thread on 32-bit where noreg is passed).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
1405
// One-argument call_VM variant with explicit last_java_sp.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1414
// Two-argument call_VM variant with explicit last_java_sp. Args are
// passed highest-numbered first to avoid clobbering (see assert).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1427
// Three-argument call_VM variant with explicit last_java_sp. Args are
// passed highest-numbered first to avoid clobbering (see asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1443
// Like call_VM but explicitly invokes MacroAssembler::call_VM_base,
// bypassing any virtual override of call_VM_base in subclasses.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
1452
// One-argument super_call_VM variant.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
1461
// Two-argument super_call_VM variant. Args are passed highest-numbered
// first to avoid clobbering (see assert).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
1474
// Three-argument super_call_VM variant. Args are passed highest-numbered
// first to avoid clobbering (see asserts).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
1490
// Common bottleneck for all call_VM variants: establishes the last Java
// frame, makes the leaf call (with the current thread as the implicit
// first argument), restores the thread register, checks for pending
// exceptions (when requested) and fetches the oop result from the thread.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
1587
// Out-of-line body for the trampoline-style call_VM variants: computes
// last_Java_sp into rax and delegates to call_VM_base.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
1610
1611 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}
1615
// Leaf call into the runtime (no Java frame bookkeeping, no exception
// check) with number_of_arguments already placed by the caller.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1619
// One-argument leaf call into the runtime.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
1624
// Two-argument leaf call into the runtime. Args are passed
// highest-numbered first to avoid clobbering (see assert).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
1632
// Three-argument leaf call into the runtime. Args are passed
// highest-numbered first to avoid clobbering (see asserts).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
1642
// One-argument leaf call, explicitly using MacroAssembler's
// call_VM_leaf_base (bypassing subclass overrides).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1647
// Two-argument super leaf call. Args are passed highest-numbered first
// to avoid clobbering (see assert).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1655
// Three-argument super leaf call. Args are passed highest-numbered first
// to avoid clobbering (see asserts).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1665
// Four-argument super leaf call. Args are passed highest-numbered first
// to avoid clobbering (see asserts).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1679
// Fetch the oop result deposited by the VM call into the thread-local
// vm_result slot, clearing the slot so the GC won't see a stale oop.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}
1685
// Fetch the metadata result from the thread-local vm_result_2 slot and
// clear the slot (no oop-verify: this is Metadata*, not an oop).
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
}
1690
// Intentionally empty at this level.
// NOTE(review): presumably overridden by an interpreter-specific
// assembler that implements the early-return protocol — confirm.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
1693
// Intentionally empty at this level.
// NOTE(review): presumably overridden by an interpreter-specific
// assembler that implements pop-frame handling — confirm.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
1696
cmp32(AddressLiteral src1,int32_t imm)1697 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1698 if (reachable(src1)) {
1699 cmpl(as_Address(src1), imm);
1700 } else {
1701 lea(rscratch1, src1);
1702 cmpl(Address(rscratch1, 0), imm);
1703 }
1704 }
1705
cmp32(Register src1,AddressLiteral src2)1706 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1707 assert(!src2.is_lval(), "use cmpptr");
1708 if (reachable(src2)) {
1709 cmpl(src1, as_Address(src2));
1710 } else {
1711 lea(rscratch1, src2);
1712 cmpl(src1, Address(rscratch1, 0));
1713 }
1714 }
1715
// 32-bit compare of a register with an immediate.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
1719
// 32-bit compare of a register with a memory operand.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
1723
// Three-way double compare: dst := -1, 0, or +1 as opr1 <, ==, > opr2.
// ucomisd sets PF when either operand is NaN (unordered); the caller
// chooses whether unordered compares as less (-1) or greater (+1).
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered => -1
    jcc(Assembler::below , L);  // less      => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    increment(dst);             // greater   => +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered => +1
    jcc(Assembler::above , L);  // greater   => +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    decrementl(dst);            // less      => -1
  }
  bind(L);
}
1745
// Three-way float compare: dst := -1, 0, or +1 as opr1 <, ==, > opr2.
// Same scheme as cmpsd2int, using ucomiss; PF flags the unordered (NaN)
// case, which maps to -1 or +1 per unordered_is_less.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered => -1
    jcc(Assembler::below , L);  // less      => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    increment(dst);             // greater   => +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered => +1
    jcc(Assembler::above , L);  // greater   => +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    decrementl(dst);            // less      => -1
  }
  bind(L);
}
1767
1768
cmp8(AddressLiteral src1,int imm)1769 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1770 if (reachable(src1)) {
1771 cmpb(as_Address(src1), imm);
1772 } else {
1773 lea(rscratch1, src1);
1774 cmpb(Address(rscratch1, 0), imm);
1775 }
1776 }
1777
// Pointer-sized compare of a register with an AddressLiteral.  An "lval"
// literal means compare against the literal's address itself; otherwise
// compare against the pointer value stored at that address.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    // Materialize the full 64-bit literal address, then compare registers.
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    // Out of rip-relative range: go through the scratch register.
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    // 32-bit: the address fits in an immediate; preserve the relocation.
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
1797
// Compare the pointer stored at src1 with the literal address src2.
// src2 must be an lval: x86 has no memory-to-memory compare, so the
// literal side must be an immediate/register operand.
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
1808
cmpoop(Register src1,Register src2)1809 void MacroAssembler::cmpoop(Register src1, Register src2) {
1810 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1811 bs->obj_equals(this, src1, src2);
1812 }
1813
cmpoop(Register src1,Address src2)1814 void MacroAssembler::cmpoop(Register src1, Address src2) {
1815 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1816 bs->obj_equals(this, src1, src2);
1817 }
1818
1819 #ifdef _LP64
cmpoop(Register src1,jobject src2)1820 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1821 movoop(rscratch1, src2);
1822 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1823 bs->obj_equals(this, src1, rscratch1);
1824 }
1825 #endif
1826
locked_cmpxchgptr(Register reg,AddressLiteral adr)1827 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1828 if (reachable(adr)) {
1829 lock();
1830 cmpxchgptr(reg, as_Address(adr));
1831 } else {
1832 lea(rscratch1, adr);
1833 lock();
1834 cmpxchgptr(reg, Address(rscratch1, 0));
1835 }
1836 }
1837
// Pointer-width compare-and-exchange: cmpxchgq on 64-bit, cmpxchgl on
// 32-bit.  Note: no lock prefix here; see locked_cmpxchgptr.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
1841
comisd(XMMRegister dst,AddressLiteral src)1842 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1843 if (reachable(src)) {
1844 Assembler::comisd(dst, as_Address(src));
1845 } else {
1846 lea(rscratch1, src);
1847 Assembler::comisd(dst, Address(rscratch1, 0));
1848 }
1849 }
1850
comiss(XMMRegister dst,AddressLiteral src)1851 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1852 if (reachable(src)) {
1853 Assembler::comiss(dst, as_Address(src));
1854 } else {
1855 lea(rscratch1, src);
1856 Assembler::comiss(dst, Address(rscratch1, 0));
1857 }
1858 }
1859
1860
// Increment the 32-bit counter at counter_addr iff `cond` holds, without
// disturbing the caller's condition codes: the increment is guarded by a
// branch on the negated condition and bracketed by pushf/popf.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr);
  popf();
  bind(L);
}
1870
// Full implementation of Java idiv and irem; checks for
// special case as described in JVM spec., p.243 & p.271.
// The function returns the (pc) offset of the idivl
// instruction - may be needed for implicit exceptions.
//
//         normal case                           special case
//
// input : rax,: dividend                         min_int
//         reg: divisor   (may not be rax,/rdx)   -1
//
// output: rax,: quotient  (= rax, idiv reg)       min_int
//         rdx: remainder (= rax, irem reg)       0
int MacroAssembler::corrected_idivl(Register reg) {
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would overflow (hardware #DE),
  // but the JVM spec requires it to yield min_int with remainder 0.
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  cdql();         // sign-extend eax into edx:eax for idivl
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
1906
1907
1908
decrementl(Register reg,int value)1909 void MacroAssembler::decrementl(Register reg, int value) {
1910 if (value == min_jint) {subl(reg, value) ; return; }
1911 if (value < 0) { incrementl(reg, -value); return; }
1912 if (value == 0) { ; return; }
1913 if (value == 1 && UseIncDec) { decl(reg) ; return; }
1914 /* else */ { subl(reg, value) ; return; }
1915 }
1916
decrementl(Address dst,int value)1917 void MacroAssembler::decrementl(Address dst, int value) {
1918 if (value == min_jint) {subl(dst, value) ; return; }
1919 if (value < 0) { incrementl(dst, -value); return; }
1920 if (value == 0) { ; return; }
1921 if (value == 1 && UseIncDec) { decl(dst) ; return; }
1922 /* else */ { subl(dst, value) ; return; }
1923 }
1924
// Signed division of reg by 2^shift_value using an arithmetic shift.
// A plain sar rounds toward minus infinity; adding (2^shift - 1) to
// negative dividends first makes the result round toward zero, matching
// Java integer division semantics.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
1941
divsd(XMMRegister dst,AddressLiteral src)1942 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1943 if (reachable(src)) {
1944 Assembler::divsd(dst, as_Address(src));
1945 } else {
1946 lea(rscratch1, src);
1947 Assembler::divsd(dst, Address(rscratch1, 0));
1948 }
1949 }
1950
divss(XMMRegister dst,AddressLiteral src)1951 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1952 if (reachable(src)) {
1953 Assembler::divss(dst, as_Address(src));
1954 } else {
1955 lea(rscratch1, src);
1956 Assembler::divss(dst, Address(rscratch1, 0));
1957 }
1958 }
1959
// Standard frame prologue: save the caller's frame pointer and make rbp
// point at the current frame.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
1964
// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    // Four redundant segment-override prefixes on a one-byte nop.
    emit_int8(0x26); // es:
    emit_int8(0x2e); // cs:
    emit_int8(0x64); // fs:
    emit_int8(0x65); // gs:
    emit_int8((unsigned char)0x90);
  }
}
1977
#ifndef _LP64
// Compare x87 ST0 with ST(1), popping both operands.
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
1982
// Compare x87 ST0 against ST(index), optionally popping the left and/or
// right operand.  With cmov support (P6+), fucomi[p] writes EFLAGS
// directly and no temp is needed; otherwise the FPU status word must be
// routed through rax (fnstsw/sahf), for which `tmp` preserves rax.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
2018
// Three-way x87 compare of ST0 with ST(1), popping both; result in dst.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
2022
// Three-way x87 compare: dst := -1, 0, or +1 as ST0 <, ==, > ST(index).
// PF flags the unordered (NaN) case, which maps to -1 or +1 according to
// unordered_is_less.  Same result scheme as cmpsd2int/cmpss2int.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered => -1
    jcc(Assembler::below , L);  // less      => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    increment(dst);             // greater   => +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered => +1
    jcc(Assembler::above , L);  // greater   => +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     =>  0
    decrementl(dst);            // less      => -1
  }
  bind(L);
}
2043
// Push the double at the literal address onto the x87 stack.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
2047
// Push the float at the literal address onto the x87 stack.
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
2051
// Push the 80-bit extended value at the literal address onto the x87 stack.
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
2055
// Load the x87 control word from the literal address.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
2059
// Pop the x87 stack: free ST0, then advance the top-of-stack pointer.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
2064
// ST0 := remainder of ST0 / ST1, looping on fprem.  fprem may reduce only
// partially; it sets C2 (visible as PF after fnstsw/sahf) while more
// reduction remains, hence the loop.  `tmp` preserves rax across the
// status-word transfer.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
    sahf();
    jcc(Assembler::parity, L);
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
2081
empty_FPU_stack()2082 void MacroAssembler::empty_FPU_stack() {
2083 if (VM_Version::supports_mmx()) {
2084 emms();
2085 } else {
2086 for (int i = 8; i-- > 0; ) ffree(i);
2087 }
2088 }
2089 #endif // !LP64
2090
mulpd(XMMRegister dst,AddressLiteral src)2091 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2092 if (reachable(src)) {
2093 Assembler::mulpd(dst, as_Address(src));
2094 } else {
2095 lea(rscratch1, src);
2096 Assembler::mulpd(dst, Address(rscratch1, 0));
2097 }
2098 }
2099
// Load a float from src into the FP result register: xmm0 when SSE is
// available, otherwise (32-bit only) the x87 stack.
void MacroAssembler::load_float(Address src) {
#ifdef _LP64
  movflt(xmm0, src);
#else
  if (UseSSE >= 1) {
    movflt(xmm0, src);
  } else {
    fld_s(src);
  }
#endif // LP64
}
2111
// Store the float FP result to dst: from xmm0 when SSE is available,
// otherwise (32-bit only) popped from the x87 stack.
void MacroAssembler::store_float(Address dst) {
#ifdef _LP64
  movflt(dst, xmm0);
#else
  if (UseSSE >= 1) {
    movflt(dst, xmm0);
  } else {
    fstp_s(dst);
  }
#endif // LP64
}
2123
// Load a double from src into the FP result register: xmm0 when SSE2 is
// available, otherwise (32-bit only) the x87 stack.
void MacroAssembler::load_double(Address src) {
#ifdef _LP64
  movdbl(xmm0, src);
#else
  if (UseSSE >= 2) {
    movdbl(xmm0, src);
  } else {
    fld_d(src);
  }
#endif // LP64
}
2135
// Store the double FP result to dst: from xmm0 when SSE2 is available,
// otherwise (32-bit only) popped from the x87 stack.
void MacroAssembler::store_double(Address dst) {
#ifdef _LP64
  movdbl(dst, xmm0);
#else
  if (UseSSE >= 2) {
    movdbl(dst, xmm0);
  } else {
    fstp_d(dst);
  }
#endif // LP64
}
2147
2148 // dst = c = a * b + c
fmad(XMMRegister dst,XMMRegister a,XMMRegister b,XMMRegister c)2149 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2150 Assembler::vfmadd231sd(c, a, b);
2151 if (dst != c) {
2152 movdbl(dst, c);
2153 }
2154 }
2155
2156 // dst = c = a * b + c
fmaf(XMMRegister dst,XMMRegister a,XMMRegister b,XMMRegister c)2157 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2158 Assembler::vfmadd231ss(c, a, b);
2159 if (dst != c) {
2160 movflt(dst, c);
2161 }
2162 }
2163
2164 // dst = c = a * b + c
vfmad(XMMRegister dst,XMMRegister a,XMMRegister b,XMMRegister c,int vector_len)2165 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2166 Assembler::vfmadd231pd(c, a, b, vector_len);
2167 if (dst != c) {
2168 vmovdqu(dst, c);
2169 }
2170 }
2171
2172 // dst = c = a * b + c
vfmaf(XMMRegister dst,XMMRegister a,XMMRegister b,XMMRegister c,int vector_len)2173 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2174 Assembler::vfmadd231ps(c, a, b, vector_len);
2175 if (dst != c) {
2176 vmovdqu(dst, c);
2177 }
2178 }
2179
2180 // dst = c = a * b + c
vfmad(XMMRegister dst,XMMRegister a,Address b,XMMRegister c,int vector_len)2181 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2182 Assembler::vfmadd231pd(c, a, b, vector_len);
2183 if (dst != c) {
2184 vmovdqu(dst, c);
2185 }
2186 }
2187
2188 // dst = c = a * b + c
vfmaf(XMMRegister dst,XMMRegister a,Address b,XMMRegister c,int vector_len)2189 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2190 Assembler::vfmadd231ps(c, a, b, vector_len);
2191 if (dst != c) {
2192 vmovdqu(dst, c);
2193 }
2194 }
2195
incrementl(AddressLiteral dst)2196 void MacroAssembler::incrementl(AddressLiteral dst) {
2197 if (reachable(dst)) {
2198 incrementl(as_Address(dst));
2199 } else {
2200 lea(rscratch1, dst);
2201 incrementl(Address(rscratch1, 0));
2202 }
2203 }
2204
// Increment the 32-bit value at an array element address.
void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}
2208
incrementl(Register reg,int value)2209 void MacroAssembler::incrementl(Register reg, int value) {
2210 if (value == min_jint) {addl(reg, value) ; return; }
2211 if (value < 0) { decrementl(reg, -value); return; }
2212 if (value == 0) { ; return; }
2213 if (value == 1 && UseIncDec) { incl(reg) ; return; }
2214 /* else */ { addl(reg, value) ; return; }
2215 }
2216
incrementl(Address dst,int value)2217 void MacroAssembler::incrementl(Address dst, int value) {
2218 if (value == min_jint) {addl(dst, value) ; return; }
2219 if (value < 0) { decrementl(dst, -value); return; }
2220 if (value == 0) { ; return; }
2221 if (value == 1 && UseIncDec) { incl(dst) ; return; }
2222 /* else */ { addl(dst, value) ; return; }
2223 }
2224
jump(AddressLiteral dst)2225 void MacroAssembler::jump(AddressLiteral dst) {
2226 if (reachable(dst)) {
2227 jmp_literal(dst.target(), dst.rspec());
2228 } else {
2229 lea(rscratch1, dst);
2230 jmp(rscratch1);
2231 }
2232 }
2233
// Conditional jump to a literal destination.  For reachable targets,
// hand-encode either the short (2-byte, 8-bit disp) or near (6-byte,
// 32-bit disp) Jcc form; short form only when there is no relocation.
// Unreachable targets are handled by inverting the condition around an
// indirect jmp through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    // Displacements are relative to the end of the instruction, hence
    // the short_size/long_size corrections below.
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
2262
ldmxcsr(AddressLiteral src)2263 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2264 if (reachable(src)) {
2265 Assembler::ldmxcsr(as_Address(src));
2266 } else {
2267 lea(rscratch1, src);
2268 Assembler::ldmxcsr(Address(rscratch1, 0));
2269 }
2270 }
2271
// Sign-extending byte load.  Returns the code offset of the memory
// access (for implicit null-check bookkeeping).  On 64-bit and P6+,
// movsbl does it in one instruction; pre-P6 falls back to a zero-extend
// plus shift pair to sign-extend.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
2284
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
// Sign-extending 16-bit load; returns the code offset of the access.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    // Pre-P6: zero-extend, then shift pair to sign-extend.
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
2304
// Zero-extending byte load; returns the code offset of the access.
// According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
// and "3.9 Partial Register Penalties", p. 22).
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    // Pre-P6: xor + byte move avoids the partial-register stall; only
    // valid when src does not use dst (the xor would clobber the base).
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
2319
// Note: load_unsigned_short used to be called load_unsigned_word.
// Zero-extending 16-bit load; returns the code offset of the access.
// According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
// and "3.9 Partial Register Penalties", p. 22).
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    // Pre-P6: xor + word move avoids the partial-register stall; only
    // valid when src does not use dst (the xor would clobber the base).
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
2335
// Load a value of 1, 2, 4, or 8 bytes from src into dst, sign- or
// zero-extending sub-word sizes per is_signed.  On 32-bit, an 8-byte
// value needs a second register (dst2) for the high half.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}
2353
// Store a value of 1, 2, 4, or 8 bytes from src to dst.  On 32-bit, an
// 8-byte value takes its high half from src2.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
2371
mov32(AddressLiteral dst,Register src)2372 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2373 if (reachable(dst)) {
2374 movl(as_Address(dst), src);
2375 } else {
2376 lea(rscratch1, dst);
2377 movl(Address(rscratch1, 0), src);
2378 }
2379 }
2380
mov32(Register dst,AddressLiteral src)2381 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2382 if (reachable(src)) {
2383 movl(dst, as_Address(src));
2384 } else {
2385 lea(rscratch1, src);
2386 movl(dst, Address(rscratch1, 0));
2387 }
2388 }
2389
2390 // C++ bool manipulation
2391
movbool(Register dst,Address src)2392 void MacroAssembler::movbool(Register dst, Address src) {
2393 if(sizeof(bool) == 1)
2394 movb(dst, src);
2395 else if(sizeof(bool) == 2)
2396 movw(dst, src);
2397 else if(sizeof(bool) == 4)
2398 movl(dst, src);
2399 else
2400 // unsupported
2401 ShouldNotReachHere();
2402 }
2403
movbool(Address dst,bool boolconst)2404 void MacroAssembler::movbool(Address dst, bool boolconst) {
2405 if(sizeof(bool) == 1)
2406 movb(dst, (int) boolconst);
2407 else if(sizeof(bool) == 2)
2408 movw(dst, (int) boolconst);
2409 else if(sizeof(bool) == 4)
2410 movl(dst, (int) boolconst);
2411 else
2412 // unsupported
2413 ShouldNotReachHere();
2414 }
2415
movbool(Address dst,Register src)2416 void MacroAssembler::movbool(Address dst, Register src) {
2417 if(sizeof(bool) == 1)
2418 movb(dst, src);
2419 else if(sizeof(bool) == 2)
2420 movw(dst, src);
2421 else if(sizeof(bool) == 4)
2422 movl(dst, src);
2423 else
2424 // unsupported
2425 ShouldNotReachHere();
2426 }
2427
// Store an immediate byte to an array element address.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
2431
movdl(XMMRegister dst,AddressLiteral src)2432 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2433 if (reachable(src)) {
2434 movdl(dst, as_Address(src));
2435 } else {
2436 lea(rscratch1, src);
2437 movdl(dst, Address(rscratch1, 0));
2438 }
2439 }
2440
movq(XMMRegister dst,AddressLiteral src)2441 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2442 if (reachable(src)) {
2443 movq(dst, as_Address(src));
2444 } else {
2445 lea(rscratch1, src);
2446 movq(dst, Address(rscratch1, 0));
2447 }
2448 }
2449
// Load a double from a literal address into dst.  movsd also clears the
// upper half of the register (preferred when UseXmmLoadAndClearUpper);
// movlpd writes only the low 64 bits.
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    // Out of disp32 range: address through the scratch register.
    lea(rscratch1, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch1, 0));
    } else {
      movlpd(dst, Address(rscratch1, 0));
    }
  }
}
2466
movflt(XMMRegister dst,AddressLiteral src)2467 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2468 if (reachable(src)) {
2469 movss(dst, as_Address(src));
2470 } else {
2471 lea(rscratch1, src);
2472 movss(dst, Address(rscratch1, 0));
2473 }
2474 }
2475
// Pointer-width register move: movq on 64-bit, movl on 32-bit.
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2479
// Pointer-width load: movq on 64-bit, movl on 32-bit.
void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2483
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (an AddressLiteral carries the relocation info a raw intptr_t lacks).
void MacroAssembler::movptr(Register dst, intptr_t src) {
  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
}
2488
// Pointer-width store: movq on 64-bit, movl on 32-bit.
void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
2492
// Unaligned 128-bit store.  Without AVX-512VL the legacy/VEX movdqu
// encoding cannot reach registers xmm16-31, hence the assert.
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2497
// Unaligned 128-bit load.  Registers xmm16-31 require AVX-512VL.
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2502
// 128-bit register-to-register move.  Registers xmm16-31 require AVX-512VL.
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::movdqu(dst, src);
}
2507
movdqu(XMMRegister dst,AddressLiteral src,Register scratchReg)2508 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2509 if (reachable(src)) {
2510 movdqu(dst, as_Address(src));
2511 } else {
2512 lea(scratchReg, src);
2513 movdqu(dst, Address(scratchReg, 0));
2514 }
2515 }
2516
// Unaligned 256-bit store.  Registers xmm16-31 require AVX-512VL.
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2521
// Unaligned 256-bit load.  Registers xmm16-31 require AVX-512VL.
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2526
// 256-bit register-to-register move.  Registers xmm16-31 require AVX-512VL.
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::vmovdqu(dst, src);
}
2531
vmovdqu(XMMRegister dst,AddressLiteral src,Register scratch_reg)2532 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2533 if (reachable(src)) {
2534 vmovdqu(dst, as_Address(src));
2535 }
2536 else {
2537 lea(scratch_reg, src);
2538 vmovdqu(dst, Address(scratch_reg, 0));
2539 }
2540 }
2541
evmovdquq(XMMRegister dst,AddressLiteral src,int vector_len,Register rscratch)2542 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2543 if (reachable(src)) {
2544 Assembler::evmovdquq(dst, as_Address(src), vector_len);
2545 } else {
2546 lea(rscratch, src);
2547 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2548 }
2549 }
2550
movdqa(XMMRegister dst,AddressLiteral src)2551 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2552 if (reachable(src)) {
2553 Assembler::movdqa(dst, as_Address(src));
2554 } else {
2555 lea(rscratch1, src);
2556 Assembler::movdqa(dst, Address(rscratch1, 0));
2557 }
2558 }
2559
movsd(XMMRegister dst,AddressLiteral src)2560 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2561 if (reachable(src)) {
2562 Assembler::movsd(dst, as_Address(src));
2563 } else {
2564 lea(rscratch1, src);
2565 Assembler::movsd(dst, Address(rscratch1, 0));
2566 }
2567 }
2568
movss(XMMRegister dst,AddressLiteral src)2569 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2570 if (reachable(src)) {
2571 Assembler::movss(dst, as_Address(src));
2572 } else {
2573 lea(rscratch1, src);
2574 Assembler::movss(dst, Address(rscratch1, 0));
2575 }
2576 }
2577
mulsd(XMMRegister dst,AddressLiteral src)2578 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2579 if (reachable(src)) {
2580 Assembler::mulsd(dst, as_Address(src));
2581 } else {
2582 lea(rscratch1, src);
2583 Assembler::mulsd(dst, Address(rscratch1, 0));
2584 }
2585 }
2586
mulss(XMMRegister dst,AddressLiteral src)2587 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2588 if (reachable(src)) {
2589 Assembler::mulss(dst, as_Address(src));
2590 } else {
2591 lea(rscratch1, src);
2592 Assembler::mulss(dst, Address(rscratch1, 0));
2593 }
2594 }
2595
// Null check of reg.  When `offset` is too large for the hardware trap
// to be relied on (needs_explicit_null_check), emit an explicit probe of
// M[reg]; otherwise the later access at reg+offset serves as the check.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
2610
// Emit a call to os::breakpoint rather than a raw int3.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
2616
// Emit a stop with an "unimplemented: <what>" message.  code_string
// copies the formatted text so it survives the ResourceMark's scope.
void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}
2627
2628 #ifdef _LP64
2629 #define XSTATE_BV 0x200
2630 #endif
2631
// Restore FPU then integer state, mirroring push_CPU_state's order
// (which pushed integer state first, FPU state second).
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
2636
// Restore FPU/SSE state saved by push_FPU_state and release its stack
// area: frstor matches fnsave (32-bit), fxrstor matches fxsave (64-bit).
void MacroAssembler::pop_FPU_state() {
#ifndef _LP64
  frstor(Address(rsp, 0));
#else
  fxrstor(Address(rsp, 0));
#endif
  addptr(rsp, FPUStateSizeInWords * wordSize);
}
2645
// Restore integer registers and flags: undo pusha, drop the 8-byte
// alignment pad added by push_IU_state (64-bit), then restore flags.
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));
  popf();
}
2651
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}
2658
// Reserve stack space and save FPU/SSE state: fnsave (plus fwait) on
// 32-bit, fxsave on 64-bit.  Matched by pop_FPU_state.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}
2668
// Save flags and all integer registers; undone by pop_IU_state().
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
2676
// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc)
// when returning from C land, so stack walkers see no stale Java frame.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
  if (!java_thread->is_valid()) {
    // Caller did not supply the thread; load it into rdi.
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

  vzeroupper();
}
2693
// Undo save_rax(): reload rax from the stack (tmp == noreg) or from tmp.
// No-op when tmp is rax itself.
void MacroAssembler::restore_rax(Register tmp) {
  if (tmp == noreg) pop(rax);
  else if (tmp != rax) mov(rax, tmp);
}
2698
// Round reg up to the next multiple of modulus (modulus must be a power
// of two for the mask in the second instruction to be correct).
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
2703
// Preserve rax: push it (tmp == noreg) or copy it into tmp.
// Paired with restore_rax().
void MacroAssembler::save_rax(Register tmp) {
  if (tmp == noreg) push(rax);
  else if (tmp != rax) mov(tmp, rax);
}
2708
// Test the thread-local safepoint/handshake poll word and branch to
// slow_path if the poll bit is set.  On 64-bit the thread register is
// always r15; on 32-bit it may need to be loaded into temp_reg.
void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
#ifdef _LP64
  assert(thread_reg == r15_thread, "should be");
#else
  if (thread_reg == noreg) {
    thread_reg = temp_reg;
    get_thread(thread_reg);
  }
#endif
  testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
}
2721
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// last_java_fp and last_java_pc are optional; sp is written last so a
// walker that sees a non-null sp also sees the fp/pc already stored.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  vzeroupper();
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
2758
// Pointer-width shift left: shlq on 64-bit, shll on 32-bit.
void MacroAssembler::shlptr(Register dst, int imm8) {
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
}
2762
// Pointer-width logical shift right: shrq on 64-bit, shrl on 32-bit.
void MacroAssembler::shrptr(Register dst, int imm8) {
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
}
2766
// Sign-extend the low byte of reg to 32 bits.  Uses movsbl where the
// register has a byte form (always on 64-bit); otherwise emulates with a
// shift pair, since only the first four registers are byte-addressable
// in 32-bit mode.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}
2775
// Sign-extend the low 16 bits of reg to 32 bits; movswl on P6+/64-bit,
// shift pair otherwise.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
2784
// testl against a literal address; unlike most AddressLiteral wrappers
// here there is no scratch-register fallback, so the target must be
// rip-reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
2789
// pcmpeqb: legacy SSE encoding can only name xmm0-15; xmm16+ requires the
// EVEX form, available with AVX512VL+BW.
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqb(dst, src);
}
2794
// pcmpeqw: registers above xmm15 need AVX512VL+BW (EVEX encoding).
void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pcmpeqw(dst, src);
}
2799
// pcmpestri (memory operand): no EVEX form exists, so dst must be xmm0-15.
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert((dst->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
2804
// pcmpestri (register operand): no EVEX form exists, so both registers
// must be xmm0-15.
void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pcmpestri(dst, src, imm8);
}
2809
// pmovzxbw (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
2814
// pmovzxbw (memory form): only dst is register-constrained.
void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pmovzxbw(dst, src);
}
2819
// pmovmskb has no EVEX encoding, so src must be xmm0-15 unconditionally.
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::pmovmskb(dst, src);
}
2824
// ptest has no EVEX encoding, so both registers must be xmm0-15.
void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::ptest(dst, src);
}
2829
// sqrtsd with a literal address operand; falls back to indirecting
// through rscratch1 when the target is out of rip-relative range.
void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtsd(dst, Address(rscratch1, 0));
  }
}
2838
// sqrtss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtss(dst, Address(rscratch1, 0));
  }
}
2847
// subsd with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subsd(dst, Address(rscratch1, 0));
  }
}
2856
// roundsd with a literal address operand and rounding mode; uses the
// caller-supplied scratch_reg (not rscratch1) for the unreachable case.
void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::roundsd(dst, as_Address(src), rmode);
  } else {
    lea(scratch_reg, src);
    Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
  }
}
2865
// subss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subss(dst, Address(rscratch1, 0));
  }
}
2874
// ucomisd with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomisd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomisd(dst, Address(rscratch1, 0));
  }
}
2883
// ucomiss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomiss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomiss(dst, Address(rscratch1, 0));
  }
}
2892
// xorpd with a literal address operand (commonly a sign-flip mask).
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-bit flipping with aligned address.
  // The legacy SSE memory form faults on unaligned operands; AVX does not.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::xorpd(dst, Address(scratch_reg, 0));
  }
}
2903
// xorpd register form.  Without AVX512DQ there is no EVEX xorpd, so for
// the self-xor (zeroing) case on AVX-512 emit a 512-bit vpxor instead,
// which clears the full register.
void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  }
  else {
    Assembler::xorpd(dst, src);
  }
}
2912
// xorps register form; same AVX-512-without-DQ zeroing workaround as
// xorpd above.
void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
  } else {
    Assembler::xorps(dst, src);
  }
}
2920
// xorps with a literal address operand (commonly a sign-flip mask).
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-bit flipping with aligned address.
  // The legacy SSE memory form faults on unaligned operands; AVX does not.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::xorps(dst, Address(scratch_reg, 0));
  }
}
2931
// pshufb with a literal address operand (shuffle-control mask).
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  // SSE memory operands must be 16-byte aligned; AVX encodings tolerate
  // unaligned addresses.
  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::pshufb(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::pshufb(dst, Address(rscratch1, 0));
  }
}
2943
2944 // AVX 3-operands instructions
2945
// vaddsd (AVX 3-operand) with a literal address operand; rscratch1
// fallback when the target is not rip-reachable.
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddsd(dst, nds, Address(rscratch1, 0));
  }
}
2954
// vaddss (AVX 3-operand) with a literal address operand; rscratch1
// fallback when the target is not rip-reachable.
void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddss(dst, nds, Address(rscratch1, 0));
  }
}
2963
// vpaddd with a literal address operand; uses the caller-supplied
// rscratch register for the unreachable case.
void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(UseAVX > 0, "requires some form of AVX");
  if (reachable(src)) {
    Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
  }
}
2973
// Float absolute value: AND with the negate_field mask via vandps.
// NOTE(review): the src parameter is only used by the assert, not by the
// emitted instruction — the masked input is nds.
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vandps(dst, nds, negate_field, vector_len);
}
2978
// Double absolute value: AND with the negate_field mask via vandpd.
// NOTE(review): as in vabsss, src only feeds the assert; nds is masked.
void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vandpd(dst, nds, negate_field, vector_len);
}
2983
// vpaddb (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
2988
// vpaddb (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddb(dst, nds, src, vector_len);
}
2993
// vpaddw (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
2998
// vpaddw (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpaddw(dst, nds, src, vector_len);
}
3003
// vpand with a literal address operand; scratch_reg fallback when the
// target is not rip-reachable.
void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    Assembler::vpand(dst, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
  }
}
3012
// vpbroadcastw: xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpbroadcastw(dst, src, vector_len);
}
3017
// vpcmpeqb: xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqb(dst, nds, src, vector_len);
}
3022
// vpcmpeqw: xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
}
3027
// vpmovzxbw (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmovzxbw(dst, src, vector_len);
}
3032
// vpmovmskb has no EVEX encoding, so src must be xmm0-15 unconditionally.
void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
  assert((src->encoding() < 16),"XMM register should be 0-15");
  Assembler::vpmovmskb(dst, src);
}
3037
// vpmullw (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmullw(dst, nds, src, vector_len);
}
3042
// vpmullw (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpmullw(dst, nds, src, vector_len);
}
3047
// vpsubb (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubb(dst, nds, src, vector_len);
}
3052
// vpsubb (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubb(dst, nds, src, vector_len);
}
3057
// vpsubw (register form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubw(dst, nds, src, vector_len);
}
3062
// vpsubw (memory form): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsubw(dst, nds, src, vector_len);
}
3067
// vpsraw (variable shift count in XMM): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsraw(dst, nds, shift, vector_len);
}
3072
// vpsraw (immediate shift count): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsraw(dst, nds, shift, vector_len);
}
3077
// evpsraq (variable count).  Without AVX512VL only the 512-bit form of
// this EVEX instruction is encodable, so narrower requests are widened.
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(UseAVX > 2,"");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
  Assembler::evpsraq(dst, nds, shift, vector_len);
}
3085
// evpsraq (immediate count); same VL widening rule as the variable-count
// overload above.
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(UseAVX > 2,"");
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
    vector_len = 2;
  }
  Assembler::evpsraq(dst, nds, shift, vector_len);
}
3093
// vpsrlw (variable shift count in XMM): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsrlw(dst, nds, shift, vector_len);
}
3098
// vpsrlw (immediate shift count): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsrlw(dst, nds, shift, vector_len);
}
3103
// vpsllw (variable shift count in XMM): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsllw(dst, nds, shift, vector_len);
}
3108
// vpsllw (immediate shift count): xmm16+ requires AVX512VL+BW.
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::vpsllw(dst, nds, shift, vector_len);
}
3113
// vptest has no EVEX encoding, so both registers must be xmm0-15.
void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
  Assembler::vptest(dst, src);
}
3118
// punpcklbw: xmm16+ requires AVX512VL+BW.
void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::punpcklbw(dst, src);
}
3123
// pshufd (memory form): xmm16+ requires AVX512VL.
void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
  Assembler::pshufd(dst, src, mode);
}
3128
// pshuflw: xmm16+ requires AVX512VL+BW.
void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
  Assembler::pshuflw(dst, src, mode);
}
3133
// vandpd with a literal address operand; scratch_reg fallback when the
// target is not rip-reachable.
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    vandpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
  }
}
3142
// vandps with a literal address operand; scratch_reg fallback when the
// target is not rip-reachable.
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    vandps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    vandps(dst, nds, Address(scratch_reg, 0), vector_len);
  }
}
3151
// vdivsd with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivsd(dst, nds, Address(rscratch1, 0));
  }
}
3160
// vdivss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivss(dst, nds, Address(rscratch1, 0));
  }
}
3169
// vmulsd with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulsd(dst, nds, Address(rscratch1, 0));
  }
}
3178
// vmulss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulss(dst, nds, Address(rscratch1, 0));
  }
}
3187
// vsubsd with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubsd(dst, nds, Address(rscratch1, 0));
  }
}
3196
// vsubss with a literal address operand; rscratch1 fallback when the
// target is not rip-reachable.
void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubss(dst, nds, Address(rscratch1, 0));
  }
}
3205
// Negate a float by XOR-ing with the sign-bit mask at src (128-bit op).
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vxorps(dst, nds, src, Assembler::AVX_128bit);
}
3210
// Negate a double by XOR-ing with the sign-bit mask at src (128-bit op).
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  vxorpd(dst, nds, src, Assembler::AVX_128bit);
}
3215
// vxorpd with a literal address operand; scratch_reg fallback when the
// target is not rip-reachable.
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
  }
}
3224
// vxorps with a literal address operand; scratch_reg fallback when the
// target is not rip-reachable.
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(scratch_reg, src);
    vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
  }
}
3233
// vpxor with a literal address operand.  The 256-bit integer form needs
// AVX2; when only AVX1 is available and a 256-bit op was requested, fall
// back to the floating-point vxorpd, which is bitwise-equivalent.
void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  if (UseAVX > 1 || (vector_len < 1)) {
    if (reachable(src)) {
      Assembler::vpxor(dst, nds, as_Address(src), vector_len);
    } else {
      lea(scratch_reg, src);
      Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
    }
  }
  else {
    MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
  }
}
3247
3248 //-------------------------------------------------------------------------------------------
3249
// Strip the jweak tag bit from a JNI handle so it can be dereferenced.
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
  const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
  STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
  // The inverted mask is sign-extended
  andptr(possibly_jweak, inverted_jweak_mask);
}
3256
// Resolve a jobject/jweak handle in `value` to the oop it refers to.
// NULL passes through unchanged.  A set weak_tag bit selects the
// phantom-reference load path; otherwise a plain IN_NATIVE load is used.
// Clobbers tmp; result is left in value.
void MacroAssembler::resolve_jobject(Register value,
                                     Register thread,
                                     Register tmp) {
  assert_different_registers(value, thread, tmp);
  Label done, not_weak;
  testptr(value, value);
  jcc(Assembler::zero, done);           // Use NULL as-is.
  testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
  jcc(Assembler::zero, not_weak);
  // Resolve jweak.
  // The -weak_tag_value displacement untags the handle before the load.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
  verify_oop(value);
  jmp(done);
  bind(not_weak);
  // Resolve (untagged) jobject.
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  bind(done);
}
3277
// Pointer-width subtract of an immediate: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}
3281
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (used where code is later patched and must keep a fixed length).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}
3286
// Pointer-width register-register subtract.
void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
3290
// C++ bool manipulation
// Test a register holding a C++ bool; the emitted instruction width is
// chosen at VM-compile time from sizeof(bool) on the build platform.
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}
3304
// Pointer-width register-register test.
void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
3308
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Delegates TLAB allocation to the active GC's BarrierSetAssembler;
// jumps to slow_case when the TLAB cannot satisfy the request.
void MacroAssembler::tlab_allocate(Register thread, Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}
3319
// Defines obj, preserves var_size_in_bytes
// Delegates shared-eden allocation to the active GC's BarrierSetAssembler;
// jumps to slow_case on failure.
void MacroAssembler::eden_allocate(Register thread, Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}
3329
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
// Zeroes length_in_bytes bytes starting at address+offset_in_bytes by
// storing a zeroed temp register in a descending word loop.  The length
// must be a multiple of BytesPerWord (checked under ASSERT).
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
  Label done;

  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);

  // initialize topmost word, divide index by 2, check if odd and test if zero
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
#ifdef ASSERT
  {
    Label L;
    testptr(length_in_bytes, BytesPerWord - 1);
    jcc(Assembler::zero, L);
    stop("length must be a multiple of BytesPerWord");
    bind(L);
  }
#endif
  Register index = length_in_bytes;
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
  if (UseIncDec) {
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
  } else {
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
    shrptr(index, 1);
  }
#ifndef _LP64
  // index could have not been a multiple of 8 (i.e., bit 2 was set)
  {
    Label even;
    // note: if index was a multiple of 8, then it cannot
    //       be 0 now otherwise it must have been 0 before
    //       => if it is even, we don't need to check for 0 again
    jcc(Assembler::carryClear, even);
    // clear topmost word (no jump would be needed if conditional assignment worked here)
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
    // index could be 0 now, must check again
    jcc(Assembler::zero, done);
    bind(even);
  }
#endif // !_LP64
  // initialize remaining object fields: index is a multiple of 2 now
  // (each iteration clears one word on 64-bit, two words on 32-bit)
  {
    Label loop;
    bind(loop);
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
    decrement(index);
    jcc(Assembler::notZero, loop);
  }

  bind(done);
}
3385
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // The loop below is the assembly equivalent of:
  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The first iteration is peeled so that a hit on the first itable entry
  // (the common case) falls straight through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
3463
3464
3465 // virtual method calling
lookup_virtual_method(Register recv_klass,RegisterOrConstant vtable_index,Register method_result)3466 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3467 RegisterOrConstant vtable_index,
3468 Register method_result) {
3469 const int base = in_bytes(Klass::vtable_start_offset());
3470 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3471 Address vtable_entry_addr(recv_klass,
3472 vtable_index, Address::times_ptr,
3473 base + vtableEntry::method_offset_in_bytes());
3474 movptr(method_result, vtable_entry_addr);
3475 }
3476
3477
check_klass_subtype(Register sub_klass,Register super_klass,Register temp_reg,Label & L_success)3478 void MacroAssembler::check_klass_subtype(Register sub_klass,
3479 Register super_klass,
3480 Register temp_reg,
3481 Label& L_success) {
3482 Label L_failure;
3483 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
3484 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3485 bind(L_failure);
3486 }
3487
3488
// Fast, partial subtype check: consults sub_klass's supertype display (and,
// via aliasing, its secondary-super cache slot).  Jumps to L_success or
// L_failure when the answer is decided here, and to L_slow_path when only the
// linear scan in check_klass_subtype_slow_path can decide.  At most one of
// the three labels may be NULL, which means "fall through" for that outcome.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb. If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label) /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    // A hit in the cache slot means "maybe": only the slow scan decides.
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
3590
3591
// Slow, definitive subtype check: linear scan of sub_klass's secondary-supers
// array with repne_scan.  On a hit, caches super_klass in the secondary-super
// cache (so the fast path succeeds next time) before jumping to L_success.
// Either label may be NULL, meaning "fall through" for that outcome.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Count slow-path subtype checks (debug/diagnostic builds only).
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success. Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
3683
// Class-initialization barrier: branch to L_fast_path if 'klass' is fully
// initialized, or if the current 'thread' is the one running its <clinit>;
// otherwise branch to L_slow_path.  Exactly one of the labels may be NULL,
// in which case that outcome falls through instead.
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    // Both labels explicit: caller must leave one NULL so we know where to
    // fall through.
    Unimplemented();
  }
}
3710
cmov32(Condition cc,Register dst,Address src)3711 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
3712 if (VM_Version::supports_cmov()) {
3713 cmovl(cc, dst, src);
3714 } else {
3715 Label L;
3716 jccb(negate_condition(cc), L);
3717 movl(dst, src);
3718 bind(L);
3719 }
3720 }
3721
cmov32(Condition cc,Register dst,Register src)3722 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
3723 if (VM_Version::supports_cmov()) {
3724 cmovl(cc, dst, src);
3725 } else {
3726 Label L;
3727 jccb(negate_condition(cc), L);
3728 movl(dst, src);
3729 bind(L);
3730 }
3731 }
3732
// Emit a call to the verify_oop stub for 'reg' (no-op unless -XX:+VerifyOops).
// The contract is to leave every register unmodified: rax (and r10 on 64-bit)
// are saved here; the stub pops the pushed arguments and restores them.
void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build a diagnostic string ("verify_oop: <reg>: <msg> (<file>:<line>)")
    // and copy it into the code blob so it outlives this ResourceMark.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
3761
vallones(XMMRegister dst,int vector_len)3762 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
3763 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
3764 vpternlogd(dst, 0xFF, dst, dst, vector_len);
3765 } else {
3766 assert(UseAVX > 0, "");
3767 vpcmpeqb(dst, dst, dst, vector_len);
3768 }
3769 }
3770
// Produce a RegisterOrConstant for a value that may not be known yet at code
// generation time.  If *delayed_value_addr is already non-zero, the constant
// (plus offset) is returned directly; otherwise code is emitted that loads the
// value indirectly into tmp at run time.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Guard against running this code before the delayed value is filled in.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      const char* buf = NULL;
      {
        ResourceMark rm;
        stringStream ss;
        ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
        buf = code_string(ss.as_string());
      }
      jcc(Assembler::notZero, L);
      STOP(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
3807
3808
argument_address(RegisterOrConstant arg_slot,int extra_slot_offset)3809 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
3810 int extra_slot_offset) {
3811 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
3812 int stackElementSize = Interpreter::stackElementSize;
3813 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
3814 #ifdef ASSERT
3815 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
3816 assert(offset1 - offset == stackElementSize, "correct arithmetic");
3817 #endif
3818 Register scale_reg = noreg;
3819 Address::ScaleFactor scale_factor = Address::no_scale;
3820 if (arg_slot.is_constant()) {
3821 offset += arg_slot.as_constant() * stackElementSize;
3822 } else {
3823 scale_reg = arg_slot.as_register();
3824 scale_factor = Address::times(stackElementSize);
3825 }
3826 offset += wordSize; // return PC is on stack
3827 return Address(rsp, scale_reg, scale_factor, offset);
3828 }
3829
3830
// Emit a call to the verify_oop stub for the oop stored at 'addr'
// (no-op unless -XX:+VerifyOops).  Mirrors _verify_oop but must compensate
// for the pushes done here when addr is rsp-relative.
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the diagnostic string and copy it into the code blob so it
    // outlives this ResourceMark.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
3870
// Debug-only TLAB sanity check: asserts tlab_start <= tlab_top <= tlab_end
// for the current thread.  No-op unless both UseTLAB and VerifyOops are on.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    // 32-bit has no dedicated thread register, so load it explicitly.
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
3901
// Decoded view of the x87 FPU control word (lower 16 bits of _value),
// used by the debug-only CPU-state dump below.
class ControlWord {
 public:
  int32_t _value;

  // Bits 10-11: rounding control (0=near, 1=down, 2=up, 3=chop).
  int  rounding_control() const  { return (_value >> 10) & 3      ; }
  // Bits 8-9: precision control (0=24 bits, 2=53 bits, 3=64 bits).
  int  precision_control() const { return (_value >>  8) & 3      ; }
  // Exception mask bits 0-5.
  bool precision() const         { return ((_value >>  5) & 1) != 0; }
  bool underflow() const         { return ((_value >>  4) & 1) != 0; }
  bool overflow() const          { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const       { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const      { return ((_value >>  1) & 1) != 0; }
  bool invalid() const           { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // rounding control
    // Initialized defensively: the switches below cover every value of a
    // 2-bit field, but without this the compiler may warn (and a future
    // edit could silently read an uninitialized pointer).
    const char* rc = "?";
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up "; break;
      case 3: rc = "chop "; break;
    }
    // precision control
    const char* pc = "?";
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
    }
    // flags
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};
3948
// Decoded view of the x87 FPU status word (lower 16 bits of _value).
class StatusWord {
 public:
  int32_t _value;

  bool busy() const         { return (_value & (1 << 15)) != 0; }
  bool C3() const           { return (_value & (1 << 14)) != 0; }
  bool C2() const           { return (_value & (1 << 10)) != 0; }
  bool C1() const           { return (_value & (1 <<  9)) != 0; }
  bool C0() const           { return (_value & (1 <<  8)) != 0; }
  // Bits 11-13: index of the top-of-stack register.
  int  top() const          { return (_value >> 11) & 7; }
  bool error_status() const { return (_value & (1 <<  7)) != 0; }
  bool stack_fault() const  { return (_value & (1 <<  6)) != 0; }
  bool precision() const    { return (_value & (1 <<  5)) != 0; }
  bool underflow() const    { return (_value & (1 <<  4)) != 0; }
  bool overflow() const     { return (_value & (1 <<  3)) != 0; }
  bool zero_divide() const  { return (_value & (1 <<  2)) != 0; }
  bool denormalized() const { return (_value & (1 <<  1)) != 0; }
  bool invalid() const      { return (_value & (1 <<  0)) != 0; }

  void print() const {
    // condition code summary, e.g. "3--0"
    char cc_buf[5];
    cc_buf[0] = C3() ? '3' : '-';
    cc_buf[1] = C2() ? '2' : '-';
    cc_buf[2] = C1() ? '1' : '-';
    cc_buf[3] = C0() ? '0' : '-';
    cc_buf[4] = '\x0';
    // exception flag summary, e.g. "E-P-----"
    char flag_buf[9];
    flag_buf[0] = error_status() ? 'E' : '-';
    flag_buf[1] = stack_fault()  ? 'S' : '-';
    flag_buf[2] = precision()    ? 'P' : '-';
    flag_buf[3] = underflow()    ? 'U' : '-';
    flag_buf[4] = overflow()     ? 'O' : '-';
    flag_buf[5] = zero_divide()  ? 'Z' : '-';
    flag_buf[6] = denormalized() ? 'D' : '-';
    flag_buf[7] = invalid()      ? 'I' : '-';
    flag_buf[8] = '\x0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, flag_buf, cc_buf, top());
  }

};
3992
// Decoded view of the x87 tag word: two tag bits per physical register
// (0=valid, 1=zero, 2=special, 3=empty).
class TagWord {
 public:
  int32_t _value;

  // Tag for physical register i.
  int tag_at(int i) const { return (_value >> (2 * i)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
4004
// One 80-bit x87 register: 64-bit mantissa in _m1:_m0 and 16-bit
// sign+exponent in _ex.
class FPU_Register {
 public:
  int32_t _m0;  // low  32 mantissa bits
  int32_t _m1;  // high 32 mantissa bits
  int16_t _ex;  // sign and biased exponent

  // The x87 "indefinite" QNaN: sign+exponent all ones, mantissa 0xC0000000_00000000.
  bool is_indefinite() const {
    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    char sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  }

};
4022
4023 class FPU_State {
4024 public:
4025 enum {
4026 register_size = 10,
4027 number_of_registers = 8,
4028 register_mask = 7
4029 };
4030
4031 ControlWord _control_word;
4032 StatusWord _status_word;
4033 TagWord _tag_word;
4034 int32_t _error_offset;
4035 int32_t _error_selector;
4036 int32_t _data_offset;
4037 int32_t _data_selector;
4038 int8_t _register[register_size * number_of_registers];
4039
tag_for_st(int i) const4040 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
st(int i) const4041 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }
4042
tag_as_string(int tag) const4043 const char* tag_as_string(int tag) const {
4044 switch (tag) {
4045 case 0: return "valid";
4046 case 1: return "zero";
4047 case 2: return "special";
4048 case 3: return "empty";
4049 }
4050 ShouldNotReachHere();
4051 return NULL;
4052 }
4053
print() const4054 void print() const {
4055 // print computation registers
4056 { int t = _status_word.top();
4057 for (int i = 0; i < number_of_registers; i++) {
4058 int j = (i - t) & register_mask;
4059 printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4060 st(j)->print();
4061 printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4062 }
4063 }
4064 printf("\n");
4065 // print control registers
4066 printf("ctrl = "); _control_word.print(); printf("\n");
4067 printf("stat = "); _status_word .print(); printf("\n");
4068 printf("tags = "); _tag_word .print(); printf("\n");
4069 }
4070
4071 };
4072
// Decoded view of the EFLAGS register for the debug-state dump.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const        { return (_value & (1 << 11)) != 0; }
  bool direction() const       { return (_value & (1 << 10)) != 0; }
  bool sign() const            { return (_value & (1 <<  7)) != 0; }
  bool zero() const            { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const { return (_value & (1 <<  4)) != 0; }
  bool parity() const          { return (_value & (1 <<  2)) != 0; }
  bool carry() const           { return (_value & (1 <<  0)) != 0; }

  void print() const {
    // flag summary, e.g. "O-S-Z--C"
    char buf[8];
    buf[0] = overflow()        ? 'O' : '-';
    buf[1] = direction()       ? 'D' : '-';
    buf[2] = sign()            ? 'S' : '-';
    buf[3] = zero()            ? 'Z' : '-';
    buf[4] = auxiliary_carry() ? 'A' : '-';
    buf[5] = parity()          ? 'P' : '-';
    buf[6] = carry()           ? 'C' : '-';
    buf[7] = '\x0';
    // output
    printf("%08x  flags = %s", _value, buf);
  }

};
4101
// One 32-bit integer register, printed in hex and signed decimal.
class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};
4111
4112 class IU_State {
4113 public:
4114 Flag_Register _eflags;
4115 IU_Register _rdi;
4116 IU_Register _rsi;
4117 IU_Register _rbp;
4118 IU_Register _rsp;
4119 IU_Register _rbx;
4120 IU_Register _rdx;
4121 IU_Register _rcx;
4122 IU_Register _rax;
4123
print() const4124 void print() const {
4125 // computation registers
4126 printf("rax, = "); _rax.print(); printf("\n");
4127 printf("rbx, = "); _rbx.print(); printf("\n");
4128 printf("rcx = "); _rcx.print(); printf("\n");
4129 printf("rdx = "); _rdx.print(); printf("\n");
4130 printf("rdi = "); _rdi.print(); printf("\n");
4131 printf("rsi = "); _rsi.print(); printf("\n");
4132 printf("rbp, = "); _rbp.print(); printf("\n");
4133 printf("rsp = "); _rsp.print(); printf("\n");
4134 printf("\n");
4135 // control registers
4136 printf("flgs = "); _eflags.print(); printf("\n");
4137 }
4138 };
4139
4140
4141 class CPU_State {
4142 public:
4143 FPU_State _fpu_state;
4144 IU_State _iu_state;
4145
print() const4146 void print() const {
4147 printf("--------------------------------------------------\n");
4148 _iu_state .print();
4149 printf("\n");
4150 _fpu_state.print();
4151 printf("--------------------------------------------------\n");
4152 }
4153
4154 };
4155
4156
_print_CPU_state(CPU_State * state)4157 static void _print_CPU_state(CPU_State* state) {
4158 state->print();
4159 };
4160
4161
// Emit code that dumps the complete CPU (integer + FPU) state via the
// _print_CPU_state runtime helper, then restores the state.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);    // discard argument
  pop_CPU_state();
}
4169
4170
4171 #ifndef _LP64
// Runtime helper called from generated code by verify_FPU(): checks that the
// x87 register stack in the captured CPU_State is contiguous and has the
// expected depth.  Returns true if consistent; on failure prints the state,
// asserts (debug builds), and returns false.
//   stack_depth  expected number of live x87 slots; negative means "leaf
//                call" mode — only require the top slot(s) to be empty
//   s            caller-supplied message, used as prefix in error output
//   state        CPU state snapshot pushed on the stack by the caller
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;  // invocation count, handy under a debugger
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {  // tag value 3 == empty slot
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth: count leading in-use slots (tag < 3), then require
  // every remaining slot to be empty (tag == 3)
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // NOTE: unreachable here — the stack_depth < 0 case already returned
    // above; kept byte-for-byte to preserve the existing code exactly.
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}
4227
// Emit code that verifies the x87 stack at the current program point (no-op
// unless -XX:+VerifyFPU).  Saves all registers, calls _verify_FPU with
// (stack_depth, s, saved-state ptr), and breaks into the debugger (int3) if
// the helper reports an inconsistent FPU state.  32-bit only (see guard).
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error: _verify_FPU returns false (rax == 0) on failure
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
4247 #endif // _LP64
4248
// Emit code run when returning from a JNI call to bring the CPU control
// state back to what compiled Java code expects: MXCSR (SSE), upper YMM
// bits (AVX), opmask k1 (EVEX, COMPILER2 only), and on 32-bit the x87
// control word.  With -Xcheck:jni the state is verified instead of restored.
void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();
  // Reset k1 to 0xffff.

#ifdef COMPILER2
  if (PostLoopMultiversioning && VM_Version::supports_evex()) {
    push(rcx);
    movl(rcx, 0xffff);
    kmovwl(k1, rcx);
    pop(rcx);
  }
#endif // COMPILER2

#ifndef _LP64
  // Either restore the x87 floating pointer control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}
4280
4281 // ((OopHandle)result).resolve();
// ((OopHandle)result).resolve();
// Emit a load that replaces the OopHandle in 'result' with the oop it
// refers to, applying IN_NATIVE access barriers.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}
4291
4292 // ((WeakHandle)result).resolve();
// ((WeakHandle)result).resolve();
// Emit a load that replaces the WeakHandle in 'rresult' with the referent,
// using phantom-reference semantics.  A NULL handle resolves to NULL.
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}
4308
// Emit code to load the java.lang.Class mirror of 'method''s holder class
// into 'mirror': method -> holder InstanceKlass -> mirror OopHandle -> oop.
void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}
4316
// Emit code to load the ClassLoaderData of 'rmethod''s holder class into
// 'rresult' (holder InstanceKlass -> class_loader_data field).
void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}
4321
// Emit code to load the InstanceKlass that declares 'method' into 'holder',
// chasing Method -> ConstMethod -> ConstantPool -> pool holder.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}
4327
// Emit code to load the Klass* of object 'src' into 'dst'.  With compressed
// class pointers (64-bit) the narrow klass is loaded and decoded via 'tmp'.
void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst, tmp);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
4339
// Emit code to load the prototype mark word of 'src''s klass into 'dst'
// (used by biased locking).
void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
  load_klass(dst, src, tmp);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}
4344
// Emit code to store Klass* 'src' into object 'dst''s header.  With
// compressed class pointers 'src' is narrowed first — NOTE that
// encode_klass_not_null overwrites 'src' with the encoded value.
void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src, tmp);  // clobbers src
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
4356
access_load_at(BasicType type,DecoratorSet decorators,Register dst,Address src,Register tmp1,Register thread_tmp)4357 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4358 Register tmp1, Register thread_tmp) {
4359 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4360 decorators = AccessInternal::decorator_fixup(decorators);
4361 bool as_raw = (decorators & AS_RAW) != 0;
4362 if (as_raw) {
4363 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4364 } else {
4365 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4366 }
4367 }
4368
access_store_at(BasicType type,DecoratorSet decorators,Address dst,Register src,Register tmp1,Register tmp2)4369 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4370 Register tmp1, Register tmp2) {
4371 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4372 decorators = AccessInternal::decorator_fixup(decorators);
4373 bool as_raw = (decorators & AS_RAW) != 0;
4374 if (as_raw) {
4375 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4376 } else {
4377 bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4378 }
4379 }
4380
// Emit code that resolves the object in 'obj' in place, delegating to the
// active GC's barrier-set assembler.  If the caller specified neither
// ACCESS_READ nor ACCESS_WRITE, both are assumed (the stronger contract).
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}
4389
// Emit a (possibly compressed) oop load from heap address 'src' into 'dst',
// applying IN_HEAP barriers plus any caller-supplied decorators.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}
4394
// Doesn't do verification, generates fixed size code.
// Like load_heap_oop, but the value is known non-NULL (IS_NOT_NULL), which
// lets the barrier code skip NULL checks.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}
4400
// Emit a (possibly compressed) oop store of 'src' to heap address 'dst',
// applying IN_HEAP barriers plus any caller-supplied decorators.
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register tmp2, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
}
4405
// Used for storing NULLs.
// 'noreg' as the source signals the barrier code to store a NULL oop.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}
4410
4411 #ifdef _LP64
// Emit a 32-bit store of 'src' into the klass-gap slot of object 'dst'.
// The gap only exists when class pointers are compressed (narrow klass
// occupies half the header word); otherwise this emits nothing.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
4418
4419 #ifdef ASSERT
// Debug-only: emit code verifying that r12_heapbase still holds the
// compressed-oops base stored in the VM global, stopping with 'msg' if not.
// Guarded by -XX:+CheckCompressedOops.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
4433 #endif
4434
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Emit code compressing the oop in 'r' in place: NULL encodes to 0;
// otherwise subtract the heap base (if any) and shift right by the object
// alignment (if any).
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    // Zero-based compressed oops: encoding is just an optional shift.
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Branch-free NULL handling: map NULL to the heap base via cmov so the
  // subtraction below yields 0 for a NULL oop.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
4453
// Emit code compressing the known-non-NULL oop in 'r' in place
// (no NULL special-casing: subtract base, then shift).
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    // Debug check that the caller's non-NULL promise holds.
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
4474
// Two-register variant: emit code compressing the known-non-NULL oop in
// 'src' into 'dst' (src is preserved unless dst == src).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    // Debug check that the caller's non-NULL promise holds.
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
4498
// Emit code decompressing the narrow oop in 'r' in place: shift left by the
// object alignment (if any) and add the heap base (if any), with 0 decoding
// back to NULL.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    // Relies on shlq's flag result: ZF set means the narrow oop was NULL,
    // so skip adding the base and let NULL decode to NULL.
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}
4517
// Emit code decompressing the known-non-NULL narrow oop in 'r' in place.
// Emits a fixed, minimal sequence (no NULL check, no verification) because
// callers count the generated instructions — see notes below.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}
4535
// Two-register variant: emit code decompressing the known-non-NULL narrow
// oop in 'src' into 'dst'.  Uses a single lea (base + src*8) when the shift
// equals 3; r12_heapbase is 0 for zero-based oops so the lea stays correct.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // One instruction: dst = r12_heapbase + src * 8.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
4563
// Emit code compressing the Klass* in 'r' in place: subtract the klass base
// (materialized in 'tmp', since it need not fit an imm32) then shift right
// by the klass alignment.
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
}
4575
// Emit code compressing the Klass* in 'src' into 'dst' without a scratch
// register: load -base into dst, then dst += src computes src - base.
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != NULL) {
    // dst = -base; dst += src  ==>  dst = src - base, leaving src intact.
    mov64(dst, -(int64_t)CompressedKlassPointers::base());
    addq(dst, src);
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(dst, LogKlassAlignmentInBytes);
  }
}
4589
// !!! If the instructions that get generated here change then function
// instr_size_for_decode_klass_not_null() needs to get updated.
// Emit code decompressing the narrow klass in 'r' in place: shift left by
// the klass alignment, then add the base (materialized in 'tmp').
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  // Note: it will change flags
  assert(UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    addq(r, tmp);
  }
}
4608
// Emit code decompressing the narrow klass in 'src' into 'dst' without a
// scratch register: materialize the base in dst, then combine with src via
// lea (shift == 3) or add (shift == 0).
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == NULL &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::base() != NULL) {
      mov64(dst, (int64_t)CompressedKlassPointers::base());
    } else {
      xorq(dst, dst);  // base is 0 but a shift is still needed below
    }
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      // One instruction: dst = base + src * 8.
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}
4637
set_narrow_oop(Register dst,jobject obj)4638 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4639 assert (UseCompressedOops, "should only be used for compressed headers");
4640 assert (Universe::heap() != NULL, "java heap should be initialized");
4641 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4642 int oop_index = oop_recorder()->find_index(obj);
4643 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4644 mov_narrow_oop(dst, oop_index, rspec);
4645 }
4646
set_narrow_oop(Address dst,jobject obj)4647 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
4648 assert (UseCompressedOops, "should only be used for compressed headers");
4649 assert (Universe::heap() != NULL, "java heap should be initialized");
4650 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4651 int oop_index = oop_recorder()->find_index(obj);
4652 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4653 mov_narrow_oop(dst, oop_index, rspec);
4654 }
4655
set_narrow_klass(Register dst,Klass * k)4656 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4657 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4658 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4659 int klass_index = oop_recorder()->find_index(k);
4660 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4661 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4662 }
4663
set_narrow_klass(Address dst,Klass * k)4664 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
4665 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4666 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4667 int klass_index = oop_recorder()->find_index(k);
4668 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4669 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4670 }
4671
cmp_narrow_oop(Register dst,jobject obj)4672 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
4673 assert (UseCompressedOops, "should only be used for compressed headers");
4674 assert (Universe::heap() != NULL, "java heap should be initialized");
4675 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4676 int oop_index = oop_recorder()->find_index(obj);
4677 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4678 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4679 }
4680
cmp_narrow_oop(Address dst,jobject obj)4681 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
4682 assert (UseCompressedOops, "should only be used for compressed headers");
4683 assert (Universe::heap() != NULL, "java heap should be initialized");
4684 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4685 int oop_index = oop_recorder()->find_index(obj);
4686 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4687 Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4688 }
4689
cmp_narrow_klass(Register dst,Klass * k)4690 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
4691 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4692 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4693 int klass_index = oop_recorder()->find_index(k);
4694 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4695 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4696 }
4697
cmp_narrow_klass(Address dst,Klass * k)4698 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
4699 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4700 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4701 int klass_index = oop_recorder()->find_index(k);
4702 RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4703 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4704 }
4705
// Emit code that (re)loads r12_heapbase with the compressed-oops base.
// Before the heap is initialized the value is loaded from the VM global
// instead of baked in as an immediate.
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    if (Universe::heap() != NULL) {
      if (CompressedOops::base() == NULL) {
        // Zero-based: keep r12 at 0 so the encode/decode arithmetic that
        // unconditionally uses r12_heapbase stays correct.
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    }
  }
}
4719
4720 #endif // _LP64
4721
// C2 compiled method's prolog code.
// Emits: optional stack bang, rbp save, frame allocation of 'framesize'
// bytes, optional debug cookies/checks, 32-bit FPU setup, and the nmethod
// entry barrier.
//   framesize       frame size in bytes, including the return-address word
//   stack_bang_size bang size in bytes; <= 0 skips the bang
//   fp_mode_24b     32-bit only: switch x87 to 24-bit precision
//   is_stub         true suppresses the nmethod entry barrier
void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    // so the first instruction is >= 5 bytes and remains patchable.
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that rsp is StackAlignmentInBytes-aligned modulo the pushed
    // return address (hence the comparison against alignment - wordSize).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->nmethod_entry_barrier(this);
  }
}
4810
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
// Structure: a main loop clearing 64 bytes per iteration, a 32-byte tail,
// then a qword-at-a-time loop for the remainder.  'base' and 'cnt' are
// clobbered; 'xtmp' is zeroed and used as the store source.
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  if (UseAVX >= 2) {
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
  } else {
    pxor(xtmp, xtmp);
  }
  jmp(L_zero_64_bytes);

  BIND(L_loop);
  // Clear 64 bytes: two 32-byte AVX stores, or four 16-byte SSE stores.
  if (UseAVX >= 2) {
    vmovdqu(Address(base,  0), xtmp);
    vmovdqu(Address(base, 32), xtmp);
  } else {
    movdqu(Address(base,  0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);
  addptr(cnt, 4);
  jccb(Assembler::less, L_tail);
  // Copy trailing 32 bytes
  if (UseAVX >= 2) {
    vmovdqu(Address(base, 0), xtmp);
  } else {
    movdqu(Address(base,  0), xtmp);
    movdqu(Address(base, 16), xtmp);
  }
  addptr(base, 32);
  subptr(cnt, 4);

  BIND(L_tail);
  addptr(cnt, 4);
  jccb(Assembler::lessEqual, L_end);
  decrement(cnt);

  BIND(L_sloop);
  // Clear the remaining (< 4) qwords one at a time.
  movq(Address(base, 0), xtmp);
  addptr(base, 8);
  decrement(cnt);
  jccb(Assembler::greaterEqual, L_sloop);
  BIND(L_end);
}
4862
// Emit code zeroing 'cnt' qwords starting at 'base'.  Small counts use an
// unrolled store loop; larger counts (or is_large) use rep stos, XMM/YMM
// stores, or rep stosb depending on flags.  Register choices are pinned by
// the rep-stos ABI (rdi/rax/rcx), enforced by the asserts below.
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
    "InitArrayShortSize should be the multiple of BytesPerLong");

  Label DONE;

  // tmp holds the zero value for the scalar paths; the XMM path zeroes
  // xtmp itself inside xmm_clear_mem.
  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    movptr(tmp, base);
    xmm_clear_mem(tmp, cnt, xtmp);
  } else {
    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
    rep_stos();
  }

  BIND(DONE);
}
4913
generate_fill(BasicType t,bool aligned,Register to,Register value,Register count,Register rtmp,XMMRegister xtmp)4914 void MacroAssembler::generate_fill(BasicType t, bool aligned,
4915 Register to, Register value, Register count,
4916 Register rtmp, XMMRegister xtmp) {
4917 ShortBranchVerifier sbv(this);
4918 assert_different_registers(to, value, count, rtmp);
4919 Label L_exit;
4920 Label L_fill_2_bytes, L_fill_4_bytes;
4921
4922 int shift = -1;
4923 switch (t) {
4924 case T_BYTE:
4925 shift = 2;
4926 break;
4927 case T_SHORT:
4928 shift = 1;
4929 break;
4930 case T_INT:
4931 shift = 0;
4932 break;
4933 default: ShouldNotReachHere();
4934 }
4935
4936 if (t == T_BYTE) {
4937 andl(value, 0xff);
4938 movl(rtmp, value);
4939 shll(rtmp, 8);
4940 orl(value, rtmp);
4941 }
4942 if (t == T_SHORT) {
4943 andl(value, 0xffff);
4944 }
4945 if (t == T_BYTE || t == T_SHORT) {
4946 movl(rtmp, value);
4947 shll(rtmp, 16);
4948 orl(value, rtmp);
4949 }
4950
4951 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
4952 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
4953 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
4954 Label L_skip_align2;
4955 // align source address at 4 bytes address boundary
4956 if (t == T_BYTE) {
4957 Label L_skip_align1;
4958 // One byte misalignment happens only for byte arrays
4959 testptr(to, 1);
4960 jccb(Assembler::zero, L_skip_align1);
4961 movb(Address(to, 0), value);
4962 increment(to);
4963 decrement(count);
4964 BIND(L_skip_align1);
4965 }
4966 // Two bytes misalignment happens only for byte and short (char) arrays
4967 testptr(to, 2);
4968 jccb(Assembler::zero, L_skip_align2);
4969 movw(Address(to, 0), value);
4970 addptr(to, 2);
4971 subl(count, 1<<(shift-1));
4972 BIND(L_skip_align2);
4973 }
4974 if (UseSSE < 2) {
4975 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
4976 // Fill 32-byte chunks
4977 subl(count, 8 << shift);
4978 jcc(Assembler::less, L_check_fill_8_bytes);
4979 align(16);
4980
4981 BIND(L_fill_32_bytes_loop);
4982
4983 for (int i = 0; i < 32; i += 4) {
4984 movl(Address(to, i), value);
4985 }
4986
4987 addptr(to, 32);
4988 subl(count, 8 << shift);
4989 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
4990 BIND(L_check_fill_8_bytes);
4991 addl(count, 8 << shift);
4992 jccb(Assembler::zero, L_exit);
4993 jmpb(L_fill_8_bytes);
4994
4995 //
4996 // length is too short, just fill qwords
4997 //
4998 BIND(L_fill_8_bytes_loop);
4999 movl(Address(to, 0), value);
5000 movl(Address(to, 4), value);
5001 addptr(to, 8);
5002 BIND(L_fill_8_bytes);
5003 subl(count, 1 << (shift + 1));
5004 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5005 // fall through to fill 4 bytes
5006 } else {
5007 Label L_fill_32_bytes;
5008 if (!UseUnalignedLoadStores) {
5009 // align to 8 bytes, we know we are 4 byte aligned to start
5010 testptr(to, 4);
5011 jccb(Assembler::zero, L_fill_32_bytes);
5012 movl(Address(to, 0), value);
5013 addptr(to, 4);
5014 subl(count, 1<<shift);
5015 }
5016 BIND(L_fill_32_bytes);
5017 {
5018 assert( UseSSE >= 2, "supported cpu only" );
5019 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5020 movdl(xtmp, value);
5021 if (UseAVX >= 2 && UseUnalignedLoadStores) {
5022 Label L_check_fill_32_bytes;
5023 if (UseAVX > 2) {
5024 // Fill 64-byte chunks
5025 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5026
5027 // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5028 cmpl(count, AVX3Threshold);
5029 jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5030
5031 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5032
5033 subl(count, 16 << shift);
5034 jccb(Assembler::less, L_check_fill_32_bytes);
5035 align(16);
5036
5037 BIND(L_fill_64_bytes_loop_avx3);
5038 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5039 addptr(to, 64);
5040 subl(count, 16 << shift);
5041 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5042 jmpb(L_check_fill_32_bytes);
5043
5044 BIND(L_check_fill_64_bytes_avx2);
5045 }
5046 // Fill 64-byte chunks
5047 Label L_fill_64_bytes_loop;
5048 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5049
5050 subl(count, 16 << shift);
5051 jcc(Assembler::less, L_check_fill_32_bytes);
5052 align(16);
5053
5054 BIND(L_fill_64_bytes_loop);
5055 vmovdqu(Address(to, 0), xtmp);
5056 vmovdqu(Address(to, 32), xtmp);
5057 addptr(to, 64);
5058 subl(count, 16 << shift);
5059 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5060
5061 BIND(L_check_fill_32_bytes);
5062 addl(count, 8 << shift);
5063 jccb(Assembler::less, L_check_fill_8_bytes);
5064 vmovdqu(Address(to, 0), xtmp);
5065 addptr(to, 32);
5066 subl(count, 8 << shift);
5067
5068 BIND(L_check_fill_8_bytes);
5069 // clean upper bits of YMM registers
5070 movdl(xtmp, value);
5071 pshufd(xtmp, xtmp, 0);
5072 } else {
5073 // Fill 32-byte chunks
5074 pshufd(xtmp, xtmp, 0);
5075
5076 subl(count, 8 << shift);
5077 jcc(Assembler::less, L_check_fill_8_bytes);
5078 align(16);
5079
5080 BIND(L_fill_32_bytes_loop);
5081
5082 if (UseUnalignedLoadStores) {
5083 movdqu(Address(to, 0), xtmp);
5084 movdqu(Address(to, 16), xtmp);
5085 } else {
5086 movq(Address(to, 0), xtmp);
5087 movq(Address(to, 8), xtmp);
5088 movq(Address(to, 16), xtmp);
5089 movq(Address(to, 24), xtmp);
5090 }
5091
5092 addptr(to, 32);
5093 subl(count, 8 << shift);
5094 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5095
5096 BIND(L_check_fill_8_bytes);
5097 }
5098 addl(count, 8 << shift);
5099 jccb(Assembler::zero, L_exit);
5100 jmpb(L_fill_8_bytes);
5101
5102 //
5103 // length is too short, just fill qwords
5104 //
5105 BIND(L_fill_8_bytes_loop);
5106 movq(Address(to, 0), xtmp);
5107 addptr(to, 8);
5108 BIND(L_fill_8_bytes);
5109 subl(count, 1 << (shift + 1));
5110 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5111 }
5112 }
5113 // fill trailing 4 bytes
5114 BIND(L_fill_4_bytes);
5115 testl(count, 1<<shift);
5116 jccb(Assembler::zero, L_fill_2_bytes);
5117 movl(Address(to, 0), value);
5118 if (t == T_BYTE || t == T_SHORT) {
5119 Label L_fill_byte;
5120 addptr(to, 4);
5121 BIND(L_fill_2_bytes);
5122 // fill trailing 2 bytes
5123 testl(count, 1<<(shift-1));
5124 jccb(Assembler::zero, L_fill_byte);
5125 movw(Address(to, 0), value);
5126 if (t == T_BYTE) {
5127 addptr(to, 2);
5128 BIND(L_fill_byte);
5129 // fill trailing byte
5130 testl(count, 1);
5131 jccb(Assembler::zero, L_exit);
5132 movb(Address(to, 0), value);
5133 } else {
5134 BIND(L_fill_byte);
5135 }
5136 } else {
5137 BIND(L_fill_2_bytes);
5138 }
5139 BIND(L_exit);
5140 }
5141
5142 // encode char[] to byte[] in ISO_8859_1
5143 //@HotSpotIntrinsicCandidate
5144 //private static int implEncodeISOArray(byte[] sa, int sp,
5145 //byte[] da, int dp, int len) {
5146 // int i = 0;
5147 // for (; i < len; i++) {
5148 // char c = StringUTF16.getChar(sa, sp++);
5149 // if (c > '\u00FF')
5150 // break;
5151 // da[dp++] = (byte)c;
5152 // }
5153 // return i;
5154 //}
// Intrinsic for StringCoding/ISO_8859_1 encoding (see Java pseudo-code in the
// comment above): compresses char elements <= 0xFF into bytes, stopping at the
// first char with any bit set in 0xFF00. Returns (in 'result') the number of
// chars successfully encoded.
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result) {

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);

  // Optimistically assume everything encodes; adjusted down at
  // L_copy_1_char_exit by the (negative) count of unprocessed chars.
  movl(result, len);

  // Setup pointers: point src/dst one-past-the-end and run 'len' from
  // -len up toward 0 so the loop index doubles as the termination test.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
      jmp(L_chars_32_check);

      // 32 chars (64 bytes) per iteration using two 256-bit loads.
      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      // vpackuswb interleaves 128-bit lanes; vpermq(0xD8) restores order.
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jcc(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    // 16 chars (32 bytes) per iteration.
    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      // Rebuild the 128-bit 0xff00ff00 mask for the SSE tail below.
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    // 8 chars (16 bytes) per iteration.
    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    // Undo the look-ahead bias so 'len' again indexes the next unprocessed char.
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  // Scalar tail: one char at a time, also handles the mismatch position.
  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements

  bind(L_done);
}
5285
5286 #ifdef _LP64
5287 /**
5288 * Helper for multiply_to_len().
5289 */
// 128-bit accumulate: dest_hi:dest_lo += src1 + src2.
// Each 64-bit addition propagates its carry into dest_hi via adc.
// Clobbers flags.
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);   // fold carry of first add into the high word
  addq(dest_lo, src2);
  adcq(dest_hi, 0);   // fold carry of second add into the high word
}
5296
5297 /**
5298 * Multiply 64 bit by 64 bit first loop.
5299 */
// First loop of multiply_to_len(): multiplies the 64-bit word x[xstart]
// by each 64-bit word of y, storing the 128-bit partial products into z
// and propagating 'carry'. The int arrays are big-endian 32-bit words, so
// 64-bit loads are byte-swapped word-wise with rorq(.., 32). L_one_x /
// L_one_y handle odd leading 32-bit words.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);  // only one 32-bit word left in x

  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);  // only one 32-bit word left in y
  movq(y_idx, Address(y, idx, Address::times_4, 0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  // Store the low 64 bits as two big-endian 32-bit words.
  movl(Address(z, kdx, Address::times_4, 4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4, 0), product);
  movq(carry, rdx);  // high half becomes the carry for the next iteration
  jmp(L_first_loop);

  bind(L_one_y);
  // Single trailing 32-bit word of y; zero-extended by movl.
  movl(y_idx, Address(y, 0));
  jmp(L_multiply);

  bind(L_one_x);
  // Single trailing 32-bit word of x; zero-extended by movl.
  movl(x_xstart, Address(x, 0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
5352
5353 /**
5354 * Multiply 64 bit by 64 bit and add 128 bit.
5355 */
// One 64x64->128 multiply-accumulate step of the third loop:
//   rdx:product = y[idx] * x_xstart + z[idx] + carry
// and stores the low 64 bits back into z (as two big-endian 32-bit words).
// On exit the high 64 bits of the result are in rdx (caller picks up the
// new carry from there). 'offset' is a byte offset into both y and z.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                                            Register yz_idx, Register idx,
                                            Register carry, Register product, int offset) {
  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  movq(yz_idx, Address(y, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian
  movq(product, x_xstart);
  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
  movq(yz_idx, Address(z, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian

  add2_with_carry(rdx, product, carry, yz_idx);

  // Store low 64 bits into z as two big-endian 32-bit words.
  movl(Address(z, idx, Address::times_4, offset+4), product);
  shrq(product, 32);
  movl(Address(z, idx, Address::times_4, offset), product);

}
5376
5377 /**
5378 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5379 */
// Third loop of multiply_to_len() (non-BMI2 path): multiply-accumulate
// x_xstart against y, two 64-bit words per iteration, then handle the
// remaining 0..3 trailing 32-bit words.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //     z[kdx+idx+1] = (jlong)product;
  //     jlong carry2  = (jlong)(product >>> 64);
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4 (number of unrolled 2x64-bit iterations)
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Upper 64-bit word first; its high half seeds carry2 for the lower word.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Handle the 0..3 remaining 32-bit words.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  // One more full 64-bit word remains.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // Final single 32-bit word: 64x32 multiply-accumulate.
  movl(yz_idx, Address(y, idx, Address::times_4, 0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4, 0));

  add2_with_carry(rdx, product, yz_idx, carry);

  movl(Address(z, idx, Address::times_4, 0), product);
  shrq(product, 32);

  // Reassemble the 64-bit carry from rdx (high 32) and product (low 32).
  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}
5454
5455 /**
5456 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5457 *
5458 */
// Third loop of multiply_to_len(), BMI2 variant: the multiplicand lives in
// rdx (implicit source of mulx) and, when ADX is available, the two carry
// chains run flag-independently via adcx/adox.
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = idx / 4 (number of unrolled 2x64-bit iterations)
  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  // Load two 64-bit words of y; rorxq swaps the big-endian 32-bit halves
  // without touching flags.
  movq(yz_idx1, Address(y, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp

  // Load the matching two 64-bit words of z to accumulate into.
  movq(yz_idx1, Address(z, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  if (VM_Version::supports_adx()) {
    // adcx uses CF and adox uses OF, so the two carry chains interleave
    // without serializing on the flags register.
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    adcxq(carry2, carry);   // collect CF chain into carry2
    adoxq(carry2, carry);   // collect OF chain into carry2
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  // Store both 64-bit results back into z as big-endian 32-bit word pairs.
  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 8), tmp3);

  movl(Address(z, idx, Address::times_4, 4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  // Handle the 0..3 remaining 32-bit words.
  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  // One more full 64-bit word remains.
  movq(yz_idx1, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4, 4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4, 0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // Final single 32-bit word: 64x32 multiply-accumulate.
  movl(tmp4, Address(y, idx, Address::times_4, 0));
  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4, 0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  movl(Address(z, idx, Address::times_4, 0), tmp3);
  shrq(tmp3, 32);

  // Reassemble the 64-bit carry from carry2 (high 32) and tmp3 (low 32).
  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}
5576
5577 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
5579 *
5580 * rdi: x
5581 * rax: xlen
5582 * rsi: y
5583 * rcx: ylen
5584 * r8: z
5585 * r11: zlen
5586 * r12: tmp1
5587 * r13: tmp2
5588 * r14: tmp3
5589 * r15: tmp4
5590 * rbx: tmp5
5591 *
5592 */
// Intrinsic for BigInteger.multiplyToLen(): schoolbook multiplication of the
// big-endian int arrays x (length xlen) and y (length ylen) into z (length
// zlen == xlen + ylen). Register assignments are listed in the comment above.
// Registers are saved/restored via push/pop; rdx is additionally clobbered
// (used implicitly by mulq/mulxq).
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // xlen and zlen are clobbered below (reused as product/x_xstart); keep
  // their values on the stack.
  push(xlen);
  push(zlen);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = zlen; // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movl(idx, ylen);      // idx = ylen;
  movl(kdx, zlen);      // kdx = xlen+ylen;
  xorq(carry, carry);   // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done);   // empty x: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the final carry of the first loop into z (one or two 32-bit
  // words, depending on how many slots remain above kdx).
  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);

  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4, 0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4, 0), carry);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  const Register jdx = tmp1;

  bind(L_second_loop);
  xorl(carry, carry);    // carry = 0;
  movl(jdx, ylen);       // j = ystart+1

  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_done);

  push (z);

  Label L_last_x;
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
  subl(xstart, 1);       // i = xstart-1;
  jcc(Assembler::negative, L_last_x);   // only one 32-bit word of x left

  // Load the next 64-bit multiplicand from x; the BMI2 path wants it in rdx
  // (implicit source of mulx).
  if (UseBMI2Instructions) {
    movq(rdx, Address(x, xstart, Address::times_4, 0));
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
  } else {
    movq(x_xstart, Address(x, xstart, Address::times_4, 0));
    rorq(x_xstart, 32);  // convert big-endian to little-endian
  }

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Save registers that the third loop clobbers (note: the value pushed as
  // xstart is popped below into xlen).
  push (x);
  push (xstart);
  push (ylen);


  if (UseBMI2Instructions) {
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
  } else { // !UseBMI2Instructions
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
  }

  pop(ylen);
  pop(xlen);   // receives the pushed xstart value (current i)
  pop(x);
  pop(z);

  // Store the 64-bit carry into z[i-1..i] as two big-endian 32-bit words;
  // stop early if only one slot remains.
  movl(tmp3, xlen);
  addl(tmp3, 1);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  subl(tmp3, 1);
  jccb(Assembler::negative, L_done);

  shrq(carry, 32);
  movl(Address(z, tmp3, Address::times_4, 0), carry);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  if (UseBMI2Instructions) {
    movl(rdx, Address(x, 0));
  } else {
    movl(x_xstart, Address(x, 0));
  }
  jmp(L_third_loop_prologue);

  bind(L_done);

  pop(zlen);
  pop(xlen);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}
5745
// Intrinsic for ArraysSupport.vectorizedMismatch(): finds the index of the
// first differing element between obja and objb, or -1 if the compared
// region is identical. 'length' arrives in elements and is converted to
// bytes by shifting with the element scale; the final result is shifted
// back to an element index (shrq(result) shifts by cl, which still holds
// log2_array_indxscale).
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;

  //scale is in rcx in both Win64 and Unix
  ShortBranchVerifier sbv(this);

  shlq(length);            // length in bytes (shift count = scale in cl)
  xorq(result, result);    // result doubles as the running byte offset

  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
      VM_Version::supports_avx512vlbw()) {
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;

    cmpq(length, 64);
    jcc(Assembler::less, VECTOR32_TAIL);

    movq(tmp1, length);
    andq(tmp1, 0x3F);      // tail count
    andq(length, ~(0x3F)); //vector count

    bind(VECTOR64_LOOP);
    // AVX512 code to compare 64 byte vectors.
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    // Vector part done; fall into the masked tail compare if any bytes remain.
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    //bind(VECTOR64_TAIL);
    // AVX512 code to compare up to 63 byte vectors.
    // Build a mask with the low 'tmp1' bits set to restrict the compare.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    // k7 holds equality bits; first zero bit is the mismatch offset.
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);           // bytes -> elements (shift count in cl)
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 32 bytes left //close the branch here.

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  } else {//regular intrinsics

    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    //falling through if less than 16 bytes left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  // 8-byte compare via GPR xor.
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 8 bytes left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  //falling through if less than 4 bytes left

  // At most 3 bytes remain; compare them one at a time (unrolled).
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  decq(length);
  jcc(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
  jmp(SAME_TILL_END);

  // Mismatch handlers: locate the first differing byte inside the vector,
  // add it to the byte offset, and convert back to an element index.
  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);   // all-ones
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);      // invert equality mask
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);           // bytes -> elements
    jmp(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);   // all-ones
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);  // all-ones
    pxor(rymm0, rymm1);
    pcmpeqb(rymm0, rymm1);
    pxor(rymm0, rymm2);     // invert equality mask
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);             // bytes -> elements
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  bsfq(tmp1, tmp1);
  shrq(tmp1, 3);            // bit index -> byte index
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);             // bytes -> elements
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);        // no mismatch

  bind(DONE);
}
5965
// Helper functions for square_to_len()
5967
5968 /**
5969 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
5970 * Preserves x and z and modifies rest of the registers.
5971 */
// See pseudo-code below: squares each 64-bit word of x (big-endian 32-bit
// word pairs), shifts the 128-bit product right by one bit (carrying the
// shifted-out bit across iterations via tmp5), and stores the results in z.
// Uses rdx:rax implicitly through mulq; preserves x and z.
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1
  // Handle odd xlen case first, then for even xlen do the following
  // jlong carry = 0;
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
  //   huge_128 product = x[j:j+1] * x[j:j+1];
  //   z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
  //   z[i+2:i+3] = (jlong)(product >>> 1);
  //   carry = (jlong)product;
  // }

  xorq(tmp5, tmp5);     // carry
  xorq(rdxReg, rdxReg);
  xorl(tmp1, tmp1);     // index for x
  xorl(tmp4, tmp4);     // index for z

  Label L_first_loop, L_first_loop_exit;

  testl(xlen, 1);
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even

  // Square and right shift by 1 the odd element using 32 bit multiply
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
  imulq(raxReg, raxReg);
  shrq(raxReg, 1);
  adcq(tmp5, 0);        // capture the shifted-out low bit as carry
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
  incrementl(tmp1);
  addl(tmp4, 2);

  // Square and right shift by 1 the rest using 64 bit multiply
  bind(L_first_loop);
  cmpptr(tmp1, xlen);
  jccb(Assembler::equal, L_first_loop_exit);

  // Square
  movq(raxReg, Address(x, tmp1, Address::times_4, 0));
  rorq(raxReg, 32);    // convert big-endian to little-endian
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax

  // Right shift by 1 and save carry
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
  rcrq(rdxReg, 1);     // rotate-through-carry chains the shift across words
  rcrq(raxReg, 1);
  adcq(tmp5, 0);       // capture the shifted-out low bit as next carry

  // Store result in z
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);

  // Update indices for x and z
  addl(tmp1, 2);
  addl(tmp4, 4);
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}
6029
6030
6031 /**
6032 * Perform the following multiply add operation using BMI2 instructions
6033 * carry:sum = sum + op1*op2 + carry
6034 * op2 should be in rdx
6035 * op2 is preserved, all other registers are modified
6036 */
multiply_add_64_bmi2(Register sum,Register op1,Register op2,Register carry,Register tmp2)6037 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6038 // assert op2 is rdx
6039 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
6040 addq(sum, carry);
6041 adcq(tmp2, 0);
6042 addq(sum, op1);
6043 adcq(tmp2, 0);
6044 movq(carry, tmp2);
6045 }
6046
6047 /**
6048 * Perform the following multiply add operation:
6049 * carry:sum = sum + op1*op2 + carry
6050 * Preserves op1, op2 and modifies rest of registers
6051 */
multiply_add_64(Register sum,Register op1,Register op2,Register carry,Register rdxReg,Register raxReg)6052 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6053 // rdx:rax = op1 * op2
6054 movq(raxReg, op2);
6055 mulq(op1);
6056
6057 // rdx:rax = sum + carry + rdx:rax
6058 addq(sum, carry);
6059 adcq(rdxReg, 0);
6060 addq(sum, raxReg);
6061 adcq(rdxReg, 0);
6062
6063 // carry:sum = rdx:sum
6064 movq(carry, rdxReg);
6065 }
6066
6067 /**
6068 * Add 64 bit long carry into z[] with carry propogation.
6069 * Preserves z and carry register values and modifies rest of registers.
6070 *
6071 */
add_one_64(Register z,Register zlen,Register carry,Register tmp1)6072 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6073 Label L_fourth_loop, L_fourth_loop_exit;
6074
6075 movl(tmp1, 1);
6076 subl(zlen, 2);
6077 addq(Address(z, zlen, Address::times_4, 0), carry);
6078
6079 bind(L_fourth_loop);
6080 jccb(Assembler::carryClear, L_fourth_loop_exit);
6081 subl(zlen, 2);
6082 jccb(Assembler::negative, L_fourth_loop_exit);
6083 addq(Address(z, zlen, Address::times_4, 0), tmp1);
6084 jmp(L_fourth_loop);
6085 bind(L_fourth_loop_exit);
6086 }
6087
6088 /**
6089 * Shift z[] left by 1 bit.
6090 * Preserves x, len, z and zlen registers and modifies rest of the registers.
6091 *
6092 */
lshift_by_1(Register x,Register len,Register z,Register zlen,Register tmp1,Register tmp2,Register tmp3,Register tmp4)6093 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6094
6095 Label L_fifth_loop, L_fifth_loop_exit;
6096
6097 // Fifth loop
6098 // Perform primitiveLeftShift(z, zlen, 1)
6099
6100 const Register prev_carry = tmp1;
6101 const Register new_carry = tmp4;
6102 const Register value = tmp2;
6103 const Register zidx = tmp3;
6104
6105 // int zidx, carry;
6106 // long value;
6107 // carry = 0;
6108 // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
6109 // (carry:value) = (z[i] << 1) | carry ;
6110 // z[i] = value;
6111 // }
6112
6113 movl(zidx, zlen);
6114 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6115
6116 bind(L_fifth_loop);
6117 decl(zidx); // Use decl to preserve carry flag
6118 decl(zidx);
6119 jccb(Assembler::negative, L_fifth_loop_exit);
6120
6121 if (UseBMI2Instructions) {
6122 movq(value, Address(z, zidx, Address::times_4, 0));
6123 rclq(value, 1);
6124 rorxq(value, value, 32);
6125 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6126 }
6127 else {
6128 // clear new_carry
6129 xorl(new_carry, new_carry);
6130
6131 // Shift z[i] by 1, or in previous carry and save new carry
6132 movq(value, Address(z, zidx, Address::times_4, 0));
6133 shlq(value, 1);
6134 adcl(new_carry, 0);
6135
6136 orq(value, prev_carry);
6137 rorq(value, 0x20);
6138 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
6139
6140 // Set previous carry = new carry
6141 movl(prev_carry, new_carry);
6142 }
6143 jmp(L_fifth_loop);
6144
6145 bind(L_fifth_loop_exit);
6146 }
6147
6148
6149 /**
6150 * Code for BigInteger::squareToLen() intrinsic
6151 *
6152 * rdi: x
6153 * rsi: len
6154 * r8: z
6155 * rcx: zlen
6156 * r12: tmp1
6157 * r13: tmp2
6158 * r14: tmp3
6159 * r15: tmp4
6160 * rbx: tmp5
6161 *
6162 */
square_to_len(Register x,Register len,Register z,Register zlen,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register rdxReg,Register raxReg)6163 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6164
6165 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6166 push(tmp1);
6167 push(tmp2);
6168 push(tmp3);
6169 push(tmp4);
6170 push(tmp5);
6171
6172 // First loop
6173 // Store the squares, right shifted one bit (i.e., divided by 2).
6174 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6175
6176 // Add in off-diagonal sums.
6177 //
6178 // Second, third (nested) and fourth loops.
6179 // zlen +=2;
6180 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6181 // carry = 0;
6182 // long op2 = x[xidx:xidx+1];
6183 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6184 // k -= 2;
6185 // long op1 = x[j:j+1];
6186 // long sum = z[k:k+1];
6187 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6188 // z[k:k+1] = sum;
6189 // }
6190 // add_one_64(z, k, carry, tmp_regs);
6191 // }
6192
6193 const Register carry = tmp5;
6194 const Register sum = tmp3;
6195 const Register op1 = tmp4;
6196 Register op2 = tmp2;
6197
6198 push(zlen);
6199 push(len);
6200 addl(zlen,2);
6201 bind(L_second_loop);
6202 xorq(carry, carry);
6203 subl(zlen, 4);
6204 subl(len, 2);
6205 push(zlen);
6206 push(len);
6207 cmpl(len, 0);
6208 jccb(Assembler::lessEqual, L_second_loop_exit);
6209
6210 // Multiply an array by one 64 bit long.
6211 if (UseBMI2Instructions) {
6212 op2 = rdxReg;
6213 movq(op2, Address(x, len, Address::times_4, 0));
6214 rorxq(op2, op2, 32);
6215 }
6216 else {
6217 movq(op2, Address(x, len, Address::times_4, 0));
6218 rorq(op2, 32);
6219 }
6220
6221 bind(L_third_loop);
6222 decrementl(len);
6223 jccb(Assembler::negative, L_third_loop_exit);
6224 decrementl(len);
6225 jccb(Assembler::negative, L_last_x);
6226
6227 movq(op1, Address(x, len, Address::times_4, 0));
6228 rorq(op1, 32);
6229
6230 bind(L_multiply);
6231 subl(zlen, 2);
6232 movq(sum, Address(z, zlen, Address::times_4, 0));
6233
6234 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6235 if (UseBMI2Instructions) {
6236 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6237 }
6238 else {
6239 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6240 }
6241
6242 movq(Address(z, zlen, Address::times_4, 0), sum);
6243
6244 jmp(L_third_loop);
6245 bind(L_third_loop_exit);
6246
6247 // Fourth loop
6248 // Add 64 bit long carry into z with carry propogation.
6249 // Uses offsetted zlen.
6250 add_one_64(z, zlen, carry, tmp1);
6251
6252 pop(len);
6253 pop(zlen);
6254 jmp(L_second_loop);
6255
6256 // Next infrequent code is moved outside loops.
6257 bind(L_last_x);
6258 movl(op1, Address(x, 0));
6259 jmp(L_multiply);
6260
6261 bind(L_second_loop_exit);
6262 pop(len);
6263 pop(zlen);
6264 pop(len);
6265 pop(zlen);
6266
6267 // Fifth loop
6268 // Shift z left 1 bit.
6269 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6270
6271 // z[zlen-1] |= x[len-1] & 1;
6272 movl(tmp3, Address(x, len, Address::times_4, -4));
6273 andl(tmp3, 1);
6274 orl(Address(z, zlen, Address::times_4, -4), tmp3);
6275
6276 pop(tmp5);
6277 pop(tmp4);
6278 pop(tmp3);
6279 pop(tmp2);
6280 pop(tmp1);
6281 }
6282
6283 /**
6284 * Helper function for mul_add()
6285 * Multiply the in[] by int k and add to out[] starting at offset offs using
6286 * 128 bit by 32 bit multiply and return the carry in tmp5.
6287 * Only quad int aligned length of in[] is operated on in this function.
6288 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
6289 * This function preserves out, in and k registers.
6290 * len and offset point to the appropriate index in "in" & "out" correspondingly
6291 * tmp5 has the carry.
6292 * other registers are temporary and are modified.
6293 *
6294 */
mul_add_128_x_32_loop(Register out,Register in,Register offset,Register len,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register rdxReg,Register raxReg)6295 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6296 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6297 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6298
6299 Label L_first_loop, L_first_loop_exit;
6300
6301 movl(tmp1, len);
6302 shrl(tmp1, 2);
6303
6304 bind(L_first_loop);
6305 subl(tmp1, 1);
6306 jccb(Assembler::negative, L_first_loop_exit);
6307
6308 subl(len, 4);
6309 subl(offset, 4);
6310
6311 Register op2 = tmp2;
6312 const Register sum = tmp3;
6313 const Register op1 = tmp4;
6314 const Register carry = tmp5;
6315
6316 if (UseBMI2Instructions) {
6317 op2 = rdxReg;
6318 }
6319
6320 movq(op1, Address(in, len, Address::times_4, 8));
6321 rorq(op1, 32);
6322 movq(sum, Address(out, offset, Address::times_4, 8));
6323 rorq(sum, 32);
6324 if (UseBMI2Instructions) {
6325 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6326 }
6327 else {
6328 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6329 }
6330 // Store back in big endian from little endian
6331 rorq(sum, 0x20);
6332 movq(Address(out, offset, Address::times_4, 8), sum);
6333
6334 movq(op1, Address(in, len, Address::times_4, 0));
6335 rorq(op1, 32);
6336 movq(sum, Address(out, offset, Address::times_4, 0));
6337 rorq(sum, 32);
6338 if (UseBMI2Instructions) {
6339 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6340 }
6341 else {
6342 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6343 }
6344 // Store back in big endian from little endian
6345 rorq(sum, 0x20);
6346 movq(Address(out, offset, Address::times_4, 0), sum);
6347
6348 jmp(L_first_loop);
6349 bind(L_first_loop_exit);
6350 }
6351
6352 /**
6353 * Code for BigInteger::mulAdd() intrinsic
6354 *
6355 * rdi: out
6356 * rsi: in
6357 * r11: offs (out.length - offset)
6358 * rcx: len
6359 * r8: k
6360 * r12: tmp1
6361 * r13: tmp2
6362 * r14: tmp3
6363 * r15: tmp4
6364 * rbx: tmp5
6365 * Multiply the in[] by word k and add to out[], return the carry in rax
6366 */
mul_add(Register out,Register in,Register offs,Register len,Register k,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register rdxReg,Register raxReg)6367 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6368 Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6369 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6370
6371 Label L_carry, L_last_in, L_done;
6372
6373 // carry = 0;
6374 // for (int j=len-1; j >= 0; j--) {
6375 // long product = (in[j] & LONG_MASK) * kLong +
6376 // (out[offs] & LONG_MASK) + carry;
6377 // out[offs--] = (int)product;
6378 // carry = product >>> 32;
6379 // }
6380 //
6381 push(tmp1);
6382 push(tmp2);
6383 push(tmp3);
6384 push(tmp4);
6385 push(tmp5);
6386
6387 Register op2 = tmp2;
6388 const Register sum = tmp3;
6389 const Register op1 = tmp4;
6390 const Register carry = tmp5;
6391
6392 if (UseBMI2Instructions) {
6393 op2 = rdxReg;
6394 movl(op2, k);
6395 }
6396 else {
6397 movl(op2, k);
6398 }
6399
6400 xorq(carry, carry);
6401
6402 //First loop
6403
6404 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
6405 //The carry is in tmp5
6406 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
6407
6408 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
6409 decrementl(len);
6410 jccb(Assembler::negative, L_carry);
6411 decrementl(len);
6412 jccb(Assembler::negative, L_last_in);
6413
6414 movq(op1, Address(in, len, Address::times_4, 0));
6415 rorq(op1, 32);
6416
6417 subl(offs, 2);
6418 movq(sum, Address(out, offs, Address::times_4, 0));
6419 rorq(sum, 32);
6420
6421 if (UseBMI2Instructions) {
6422 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6423 }
6424 else {
6425 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6426 }
6427
6428 // Store back in big endian from little endian
6429 rorq(sum, 0x20);
6430 movq(Address(out, offs, Address::times_4, 0), sum);
6431
6432 testl(len, len);
6433 jccb(Assembler::zero, L_carry);
6434
6435 //Multiply the last in[] entry, if any
6436 bind(L_last_in);
6437 movl(op1, Address(in, 0));
6438 movl(sum, Address(out, offs, Address::times_4, -4));
6439
6440 movl(raxReg, k);
6441 mull(op1); //tmp4 * eax -> edx:eax
6442 addl(sum, carry);
6443 adcl(rdxReg, 0);
6444 addl(sum, raxReg);
6445 adcl(rdxReg, 0);
6446 movl(carry, rdxReg);
6447
6448 movl(Address(out, offs, Address::times_4, -4), sum);
6449
6450 bind(L_carry);
6451 //return tmp5/carry as carry in rax
6452 movl(rax, carry);
6453
6454 bind(L_done);
6455 pop(tmp5);
6456 pop(tmp4);
6457 pop(tmp3);
6458 pop(tmp2);
6459 pop(tmp1);
6460 }
6461 #endif
6462
6463 /**
6464 * Emits code to update CRC-32 with a byte value according to constants in table
6465 *
6466 * @param [in,out]crc Register containing the crc.
6467 * @param [in]val Register containing the byte to fold into the CRC.
6468 * @param [in]table Register containing the table of crc constants.
6469 *
6470 * uint32_t crc;
6471 * val = crc_table[(val ^ crc) & 0xFF];
6472 * crc = val ^ (crc >> 8);
6473 *
6474 */
update_byte_crc32(Register crc,Register val,Register table)6475 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6476 xorl(val, crc);
6477 andl(val, 0xFF);
6478 shrl(crc, 8); // unsigned shift
6479 xorl(crc, Address(table, val, Address::times_4, 0));
6480 }
6481
6482 /**
6483 * Fold 128-bit data chunk
6484 */
fold_128bit_crc32(XMMRegister xcrc,XMMRegister xK,XMMRegister xtmp,Register buf,int offset)6485 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6486 if (UseAVX > 0) {
6487 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
6488 vpclmulldq(xcrc, xK, xcrc); // [63:0]
6489 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
6490 pxor(xcrc, xtmp);
6491 } else {
6492 movdqa(xtmp, xcrc);
6493 pclmulhdq(xtmp, xK); // [123:64]
6494 pclmulldq(xcrc, xK); // [63:0]
6495 pxor(xcrc, xtmp);
6496 movdqu(xtmp, Address(buf, offset));
6497 pxor(xcrc, xtmp);
6498 }
6499 }
6500
// Fold 128-bit data chunk, register-source variant: same folding as the
// memory-operand overload above, but the 16 data bytes come from xbuf
// instead of memory. xtmp is scratch.
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
  if (UseAVX > 0) {
    vpclmulhdq(xtmp, xK, xcrc);  // high-half carry-less product [123:64]
    vpclmulldq(xcrc, xK, xcrc);  // low-half carry-less product  [63:0]
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  } else {
    movdqa(xtmp, xcrc);          // SSE clmul is destructive: work on a copy
    pclmulhdq(xtmp, xK);
    pclmulldq(xcrc, xK);
    pxor(xcrc, xbuf);
    pxor(xcrc, xtmp);
  }
}
6515
6516 /**
6517 * 8-bit folds to compute 32-bit CRC
6518 *
6519 * uint64_t xcrc;
6520 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
6521 */
fold_8bit_crc32(XMMRegister xcrc,Register table,XMMRegister xtmp,Register tmp)6522 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
6523 movdl(tmp, xcrc);
6524 andl(tmp, 0xFF);
6525 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
6526 psrldq(xcrc, 1); // unsigned shift one byte
6527 pxor(xcrc, xtmp);
6528 }
6529
6530 /**
6531 * uint32_t crc;
6532 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
6533 */
fold_8bit_crc32(Register crc,Register table,Register tmp)6534 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
6535 movl(tmp, crc);
6536 andl(tmp, 0xFF);
6537 shrl(crc, 8);
6538 xorl(crc, Address(table, tmp, Address::times_4, 0));
6539 }
6540
6541 /**
6542 * @param crc register containing existing CRC (32-bit)
6543 * @param buf register pointing to input byte buffer (byte*)
6544 * @param len register containing number of bytes
6545 * @param table register that will contain address of CRC table
6546 * @param tmp scratch register
6547 */
kernel_crc32(Register crc,Register buf,Register len,Register table,Register tmp)6548 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
6549 assert_different_registers(crc, buf, len, table, tmp, rax);
6550
6551 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6552 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6553
6554 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
6555 // context for the registers used, where all instructions below are using 128-bit mode
6556 // On EVEX without VL and BW, these instructions will all be AVX.
6557 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
6558 notl(crc); // ~crc
6559 cmpl(len, 16);
6560 jcc(Assembler::less, L_tail);
6561
6562 // Align buffer to 16 bytes
6563 movl(tmp, buf);
6564 andl(tmp, 0xF);
6565 jccb(Assembler::zero, L_aligned);
6566 subl(tmp, 16);
6567 addl(len, tmp);
6568
6569 align(4);
6570 BIND(L_align_loop);
6571 movsbl(rax, Address(buf, 0)); // load byte with sign extension
6572 update_byte_crc32(crc, rax, table);
6573 increment(buf);
6574 incrementl(tmp);
6575 jccb(Assembler::less, L_align_loop);
6576
6577 BIND(L_aligned);
6578 movl(tmp, len); // save
6579 shrl(len, 4);
6580 jcc(Assembler::zero, L_tail_restore);
6581
6582 // Fold crc into first bytes of vector
6583 movdqa(xmm1, Address(buf, 0));
6584 movdl(rax, xmm1);
6585 xorl(crc, rax);
6586 if (VM_Version::supports_sse4_1()) {
6587 pinsrd(xmm1, crc, 0);
6588 } else {
6589 pinsrw(xmm1, crc, 0);
6590 shrl(crc, 16);
6591 pinsrw(xmm1, crc, 1);
6592 }
6593 addptr(buf, 16);
6594 subl(len, 4); // len > 0
6595 jcc(Assembler::less, L_fold_tail);
6596
6597 movdqa(xmm2, Address(buf, 0));
6598 movdqa(xmm3, Address(buf, 16));
6599 movdqa(xmm4, Address(buf, 32));
6600 addptr(buf, 48);
6601 subl(len, 3);
6602 jcc(Assembler::lessEqual, L_fold_512b);
6603
6604 // Fold total 512 bits of polynomial on each iteration,
6605 // 128 bits per each of 4 parallel streams.
6606 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
6607
6608 align(32);
6609 BIND(L_fold_512b_loop);
6610 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
6611 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
6612 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
6613 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
6614 addptr(buf, 64);
6615 subl(len, 4);
6616 jcc(Assembler::greater, L_fold_512b_loop);
6617
6618 // Fold 512 bits to 128 bits.
6619 BIND(L_fold_512b);
6620 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6621 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
6622 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
6623 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
6624
6625 // Fold the rest of 128 bits data chunks
6626 BIND(L_fold_tail);
6627 addl(len, 3);
6628 jccb(Assembler::lessEqual, L_fold_128b);
6629 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6630
6631 BIND(L_fold_tail_loop);
6632 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
6633 addptr(buf, 16);
6634 decrementl(len);
6635 jccb(Assembler::greater, L_fold_tail_loop);
6636
6637 // Fold 128 bits in xmm1 down into 32 bits in crc register.
6638 BIND(L_fold_128b);
6639 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
6640 if (UseAVX > 0) {
6641 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
6642 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
6643 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
6644 } else {
6645 movdqa(xmm2, xmm0);
6646 pclmulqdq(xmm2, xmm1, 0x1);
6647 movdqa(xmm3, xmm0);
6648 pand(xmm3, xmm2);
6649 pclmulqdq(xmm0, xmm3, 0x1);
6650 }
6651 psrldq(xmm1, 8);
6652 psrldq(xmm2, 4);
6653 pxor(xmm0, xmm1);
6654 pxor(xmm0, xmm2);
6655
6656 // 8 8-bit folds to compute 32-bit CRC.
6657 for (int j = 0; j < 4; j++) {
6658 fold_8bit_crc32(xmm0, table, xmm1, rax);
6659 }
6660 movdl(crc, xmm0); // mov 32 bits to general register
6661 for (int j = 0; j < 4; j++) {
6662 fold_8bit_crc32(crc, table, rax);
6663 }
6664
6665 BIND(L_tail_restore);
6666 movl(len, tmp); // restore
6667 BIND(L_tail);
6668 andl(len, 0xf);
6669 jccb(Assembler::zero, L_exit);
6670
6671 // Fold the rest of bytes
6672 align(4);
6673 BIND(L_tail_loop);
6674 movsbl(rax, Address(buf, 0)); // load byte with sign extension
6675 update_byte_crc32(crc, rax, table);
6676 increment(buf);
6677 decrementl(len);
6678 jccb(Assembler::greater, L_tail_loop);
6679
6680 BIND(L_exit);
6681 notl(crc); // ~c
6682 }
6683
6684 #ifdef _LP64
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks
//
// Folds the 512-bit CRC state in xcrc over the 64 bytes at
// [buf + pos + offset] using two carry-less multiplications by the
// constants in xK:
//   xcrc = clmul(xcrc, xK.hi) ^ clmul(xcrc, xK.lo) ^ mem512[buf + pos + offset]
// xtmp is scratch; xmm2 and xmm3 are clobbered as additional scratch.
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
6695
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers
//
// Called from the main AVX-512 CRC kernel for buffers shorter than 256
// bytes. On exit it either jumps to one of the caller-supplied labels
// (L_barrett / L_16B_reduction_loop / L_get_last_two_xmms / L_128_done /
// L_cleanup) with the folded state in xmm7, or (for len == 0) leaves the
// initial crc in rax and jumps to L_cleanup.
//
// For len < 16 the bytes are assembled on the stack: a 16-byte slot at
// [rsp] is zeroed, filled with the available bytes, then shuffled into
// position via the shuf_table constant indexed by the original length.
// NOTE(review): this stores through rsp, so the caller must have reserved
// stack scratch space beforehand — confirm against the calling kernel.
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {

  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;

  // check if there is enough buffer to be able to fold 16B at a time
  cmpl(len, 32);
  jcc(Assembler::less, L_less_than_32);

  // if there is, load the constants
  movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10
  movdl(xmm0, crc);                    // get the initial crc value
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0);

  // update the buffer pointer
  addl(pos, 16);
  //update the counter.subtract 32 instead of 16 to save one instruction from the loop
  subl(len, 32);
  jmp(L_16B_reduction_loop);

  bind(L_less_than_32);
  //mov initial crc to the return value. this is necessary for zero - length buffers.
  movl(rax, crc);
  testl(len, len);
  jcc(Assembler::equal, L_cleanup);

  movdl(xmm0, crc); //get the initial crc value

  cmpl(len, 16);
  jcc(Assembler::equal, L_exact_16_left);
  jcc(Assembler::less, L_less_than_16_left);

  // 16 < len < 32: one full 16-byte block plus a partial one
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
  pxor(xmm7, xmm0); //xor the initial crc value
  addl(pos, 16);
  subl(len, 16);
  movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
  jmp(L_get_last_two_xmms);

  bind(L_less_than_16_left);
  //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
  pxor(xmm1, xmm1);
  movptr(tmp1, rsp);
  movdqu(Address(tmp1, 0 * 16), xmm1);

  cmpl(len, 4);
  jcc(Assembler::less, L_only_less_than_4);

  //backup the counter value: needed later to index the shuffle table
  movl(tmp2, len);
  cmpl(len, 8);
  jcc(Assembler::less, L_less_than_8_left);

  //load 8 Bytes
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
  movq(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 8);
  subl(len, 8);
  addl(pos, 8);

  bind(L_less_than_8_left);
  cmpl(len, 4);
  jcc(Assembler::less, L_less_than_4_left);

  //load 4 Bytes
  movl(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 4);
  subl(len, 4);
  addl(pos, 4);

  bind(L_less_than_4_left);
  cmpl(len, 2);
  jcc(Assembler::less, L_less_than_2_left);

  // load 2 Bytes
  movw(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 2);
  subl(len, 2);
  addl(pos, 2);

  bind(L_less_than_2_left);
  cmpl(len, 1);
  jcc(Assembler::less, L_zero_left);

  // load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0 * 16), rax);

  bind(L_zero_left);
  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  // shuffle the loaded bytes into place according to the original length
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, tmp2));
  pshufb(xmm7, xmm0);
  jmp(L_128_done);

  bind(L_exact_16_left);
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
  pxor(xmm7, xmm0); //xor the initial crc value
  jmp(L_128_done);

  // len < 4: assemble the bytes and left-shift into position, then go
  // straight to the Barrett reduction
  bind(L_only_less_than_4);
  cmpl(len, 3);
  jcc(Assembler::less, L_only_less_than_3);

  // load 3 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movb(rax, Address(buf, pos, Address::times_1, 2));
  movb(Address(tmp1, 2), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x5);
  jmp(L_barrett);
  bind(L_only_less_than_3);
  cmpl(len, 2);
  jcc(Assembler::less, L_only_less_than_2);

  // load 2 Bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x6);
  jmp(L_barrett);

  bind(L_only_less_than_2);
  //load 1 Byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); //xor the initial crc value

  pslldq(xmm7, 0x7);
  // fall through to the caller's L_barrett reduction
}
6852
6853 /**
6854 * Compute CRC32 using AVX512 instructions
6855 * param crc register containing existing CRC (32-bit)
6856 * param buf register pointing to input byte buffer (byte*)
6857 * param len register containing number of bytes
6858 * param tmp1 scratch register
6859 * param tmp2 scratch register
6860 * return rax result register
6861 */
kernel_crc32_avx512(Register crc,Register buf,Register len,Register key,Register tmp1,Register tmp2)6862 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
6863 assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
6864
6865 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6866 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6867 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
6868 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
6869 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
6870
6871 const Register pos = r12;
6872 push(r12);
6873 subptr(rsp, 16 * 2 + 8);
6874
6875 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
6876 // context for the registers used, where all instructions below are using 128-bit mode
6877 // On EVEX without VL and BW, these instructions will all be AVX.
6878 lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6879 notl(crc);
6880 movl(pos, 0);
6881
6882 // check if smaller than 256B
6883 cmpl(len, 256);
6884 jcc(Assembler::less, L_less_than_256);
6885
6886 // load the initial crc value
6887 movdl(xmm10, crc);
6888
6889 // receive the initial 64B data, xor the initial crc value
6890 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
6891 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
6892 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
6893 evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
6894
6895 subl(len, 256);
6896 cmpl(len, 256);
6897 jcc(Assembler::less, L_fold_128_B_loop);
6898
6899 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
6900 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
6901 evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
6902 subl(len, 256);
6903
6904 bind(L_fold_256_B_loop);
6905 addl(pos, 256);
6906 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
6907 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
6908 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
6909 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
6910
6911 subl(len, 256);
6912 jcc(Assembler::greaterEqual, L_fold_256_B_loop);
6913
6914 // Fold 256 into 128
6915 addl(pos, 256);
6916 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
6917 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
6918 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
6919
6920 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
6921 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
6922 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
6923
6924 evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
6925 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
6926
6927 addl(len, 128);
6928 jmp(L_fold_128_B_register);
6929
6930 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop
6931 // loop will fold 128B at a time until we have 128 + y Bytes of buffer
6932
6933 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel
6934 bind(L_fold_128_B_loop);
6935 addl(pos, 128);
6936 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
6937 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
6938
6939 subl(len, 128);
6940 jcc(Assembler::greaterEqual, L_fold_128_B_loop);
6941
6942 addl(pos, 128);
6943
6944 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
6945 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
6946 bind(L_fold_128_B_register);
6947 evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
6948 evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
6949 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
6950 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
6951 // save last that has no multiplicand
6952 vextracti64x2(xmm7, xmm4, 3);
6953
6954 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
6955 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
6956 // Needed later in reduction loop
6957 movdqu(xmm10, Address(key, 1 * 16));
6958 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
6959 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
6960
6961 // Swap 1,0,3,2 - 01 00 11 10
6962 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
6963 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
6964 vextracti128(xmm5, xmm8, 1);
6965 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
6966
6967 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
6968 // instead of a cmp instruction, we use the negative flag with the jl instruction
6969 addl(len, 128 - 16);
6970 jcc(Assembler::less, L_final_reduction_for_128);
6971
6972 bind(L_16B_reduction_loop);
6973 vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
6974 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
6975 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
6976 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
6977 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
6978 addl(pos, 16);
6979 subl(len, 16);
6980 jcc(Assembler::greaterEqual, L_16B_reduction_loop);
6981
6982 bind(L_final_reduction_for_128);
6983 addl(len, 16);
6984 jcc(Assembler::equal, L_128_done);
6985
6986 bind(L_get_last_two_xmms);
6987 movdqu(xmm2, xmm7);
6988 addl(pos, len);
6989 movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
6990 subl(pos, len);
6991
6992 // get rid of the extra data that was loaded before
6993 // load the shift constant
6994 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
6995 movdqu(xmm0, Address(rax, len));
6996 addl(rax, len);
6997
6998 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
6999 //Change mask to 512
7000 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7001 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7002
7003 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7004 vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7005 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7006 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7007 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7008
7009 bind(L_128_done);
7010 // compute crc of a 128-bit value
7011 movdqu(xmm10, Address(key, 3 * 16));
7012 movdqu(xmm0, xmm7);
7013
7014 // 64b fold
7015 vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7016 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7017 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7018
7019 // 32b fold
7020 movdqu(xmm0, xmm7);
7021 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7022 vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7023 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7024 jmp(L_barrett);
7025
7026 bind(L_less_than_256);
7027 kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7028
7029 //barrett reduction
7030 bind(L_barrett);
7031 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7032 movdqu(xmm1, xmm7);
7033 movdqu(xmm2, xmm7);
7034 movdqu(xmm10, Address(key, 4 * 16));
7035
7036 pclmulqdq(xmm7, xmm10, 0x0);
7037 pxor(xmm7, xmm2);
7038 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7039 movdqu(xmm2, xmm7);
7040 pclmulqdq(xmm7, xmm10, 0x10);
7041 pxor(xmm7, xmm2);
7042 pxor(xmm7, xmm1);
7043 pextrd(crc, xmm7, 2);
7044
7045 bind(L_cleanup);
7046 notl(crc); // ~c
7047 addptr(rsp, 16 * 2 + 8);
7048 pop(r12);
7049 }
7050
7051 // S. Gueron / Information Processing Letters 112 (2012) 184
7052 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7053 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7054 // Output: the 64-bit carry-less product of B * CONST
// Gueron Algorithm 4 (see comment above): compute the 64-bit carry-less
// product of the 32-bit value B (in 'in') and a precomputed constant,
// using four 256-entry lookup tables -- one table lookup per byte of B.
// 'n' selects which precomputed table set ("chunk") to use.
// The 64-bit result is left in 'in'; clobbers tmp1, tmp2, tmp3.
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  // tmp3 = base of TABLEExt[n]; each table holds 256 eight-byte entries
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                  // byte value -> 8-byte table offset
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 8);                  // align Q2 with its byte position
  xorq(tmp1, tmp2);               // accumulate Q1 ^ (Q2 << 8)

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 16);
  xorq(tmp1, tmp2);               // ... ^ (Q3 << 16)

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);
  xorq(in, tmp1);                 // final result in 'in'
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
7101
// Carry-less multiply of the 32-bit value in in_out by a constant.
// When PCLMULQDQ is supported, const_or_pre_comp_const_index is the
// constant itself and one pclmulqdq does the job; otherwise it is an
// index selecting the lookup tables used by crc32c_ipl_alg4.
// 64-bit variant: the full 64-bit product is moved back into in_out.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out); // modified blindly

    // load the constant into an XMM register and multiply carry-less
    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);

    movdq(in_out, w_xtmp1);         // product back to the GPR
  } else {
    // fallback: table-driven carry-less multiply
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
  }
}
7120
7121 // Recombination Alternative 2: No bit-reflections
7122 // T1 = (CRC_A * U1) << 1
7123 // T2 = (CRC_B * U2) << 1
7124 // C1 = T1 >> 32
7125 // C2 = T2 >> 32
7126 // T1 = T1 & 0xFFFFFFFF
7127 // T2 = T2 & 0xFFFFFFFF
7128 // T1 = CRC32(0, T1)
7129 // T2 = CRC32(0, T2)
7130 // C1 = C1 ^ T1
7131 // C2 = C2 ^ T2
7132 // CRC = C1 ^ C2 ^ CRC_C
// Recombination Alternative 2 (see algorithm comment above): fold the
// three partial CRCs -- CRC_A in in_out, CRC_B in in1, CRC_C in in2 --
// into a single CRC left in in_out. Clobbers tmp1, tmp2, in1.
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  // CRC_A * U1 and CRC_B * U2, carry-less
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  // T1 = (CRC_A * U1) << 1; split into low half T1 and high half C1
  shlq(in_out, 1);
  movl(tmp1, in_out);        // T1 = low 32 bits
  shrq(in_out, 32);          // C1 = high 32 bits
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);      // T1 = CRC32(0, T1)
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  // same sequence for the second partial CRC
  shlq(in1, 1);
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
7154
7155 // Set N to predefined value
// Subtract it from the length of the buffer
7157 // execute in a loop:
7158 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7159 // for i = 1 to N do
7160 // CRC_A = CRC32(CRC_A, A[i])
7161 // CRC_B = CRC32(CRC_B, B[i])
7162 // CRC_C = CRC32(CRC_C, C[i])
7163 // end for
7164 // Recombine
// Process the buffer in partitions of 3*size bytes: split each partition
// into three size-byte chunks A, B, C and CRC them with three
// independent crc32 instructions per iteration (pipelined), then
// recombine via crc32c_rec_alt2.
// in_out1 = remaining length, in_out2 = buffer pointer,
// in_out3 = running CRC. Exits when fewer than 3*size bytes remain.
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);       // not enough input for a full partition
  xorl(tmp1, tmp1);                   // CRC_B = 0
  xorl(tmp2, tmp2);                   // CRC_C = 0
  movq(tmp3, in_out2);
  addq(tmp3, size);                   // end of chunk A

  bind(L_processPartition);
  // three independent dependency chains keep the crc32 unit busy
  crc32(in_out3, Address(in_out2, 0), 8);
  crc32(tmp1, Address(in_out2, size), 8);
  crc32(tmp2, Address(in_out2, size * 2), 8);
  addq(in_out2, 8);
  cmpq(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);
  // fold the three partial CRCs into in_out3
  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
          w_xtmp1, w_xtmp2, w_xtmp3,
          tmp4, tmp5,
          n_tmp6);
  addq(in_out2, 2 * size);            // skip over chunks B and C
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
7200 #else
// 32-bit variant of Gueron Algorithm 4: same four table lookups as the
// LP64 version, but the 64-bit product cannot live in a 32-bit GPR, so
// the accumulation happens in XMM registers and the result is left in
// xtmp1. Clobbers in_out, tmp1, tmp2, tmp3, xtmp2.
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3,
                                     XMMRegister xtmp1, XMMRegister xtmp2) {
  // tmp3 = base of TABLEExt[n]; each table holds 256 eight-byte entries
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addl(tmp3, n * 256 * 8);
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in_out);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);                  // byte value -> 8-byte table offset
  addl(tmp1, tmp3);
  movq(xtmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 8);                // align Q2 with its byte position
  pxor(xtmp1, xtmp2);             // accumulate Q1 ^ (Q2 << 8)

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 16);
  pxor(xtmp1, xtmp2);             // ... ^ (Q3 << 16)

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in_out, 24);
  andl(in_out, 0x000000FF);
  shll(in_out, 3);
  addl(in_out, tmp3);
  movq(xtmp2, Address(in_out, 0));

  psllq(xtmp2, 24);
  pxor(xtmp1, xtmp2); // Result in CXMM
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
7248
// 32-bit variant of crc32c_pclmulqdq: carry-less multiply of the value
// in in_out by a constant, leaving the 64-bit product in w_xtmp1 (it
// does not fit in a 32-bit GPR). In the fallback path the table-driven
// crc32c_ipl_alg4 likewise leaves its result in w_xtmp1.
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out);

    // load the constant into an XMM register and multiply carry-less
    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);
    // Keep result in XMM since GPR is 32 bit in length
  } else {
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
  }
}
7266
// 32-bit variant of Recombination Alternative 2: the 64-bit carry-less
// products live in w_xtmp1/w_xtmp2, so the shift-by-1 and the
// low/high-half split are done with psllq/psrlq + movdl instead of GPR
// shifts. Result left in in_out; clobbers tmp1, tmp2, in1.
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  // CRC_A * U1 -> w_xtmp1, CRC_B * U2 -> w_xtmp2 (carry-less)
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);

  // T1 = (CRC_A * U1) << 1; tmp1 = low half, in_out = C1 = high half
  psllq(w_xtmp1, 1);
  movdl(tmp1, w_xtmp1);
  psrlq(w_xtmp1, 32);
  movdl(in_out, w_xtmp1);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);      // T1 = CRC32(0, T1)
  xorl(in_out, tmp2);        // C1 ^= T1

  // same sequence for the second partial CRC
  psllq(w_xtmp2, 1);
  movdl(tmp1, w_xtmp2);
  psrlq(w_xtmp2, 32);
  movdl(in1, w_xtmp2);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  // CRC = C1 ^ C2 ^ CRC_C
  xorl(in_out, in1);
  xorl(in_out, in2);
}
7294
// 32-bit variant of crc32c_proc_chunk: same partitioning scheme as the
// LP64 version, but each 8-byte step is done as two 4-byte crc32 ops,
// and -- because of 32-bit register pressure -- tmp4/tmp5/n_tmp6 are
// re-aliased to tmp3/in_out1/in_out2, which are saved on the stack
// around the recombination call and restored afterwards.
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);       // not enough input for a full partition
  xorl(tmp1, tmp1);                   // CRC_B = 0
  xorl(tmp2, tmp2);                   // CRC_C = 0
  movl(tmp3, in_out2);
  addl(tmp3, size);                   // end of chunk A

  bind(L_processPartition);
  // 4-byte crc32 ops, two per chunk per iteration (no 8-byte form here)
  crc32(in_out3, Address(in_out2, 0), 4);
  crc32(tmp1, Address(in_out2, size), 4);
  crc32(tmp2, Address(in_out2, size*2), 4);
  crc32(in_out3, Address(in_out2, 0+4), 4);
  crc32(tmp1, Address(in_out2, size+4), 4);
  crc32(tmp2, Address(in_out2, size*2+4), 4);
  addl(in_out2, 8);
  cmpl(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);

  // spill the registers that the recombination will reuse as scratch
  push(tmp3);
  push(in_out1);
  push(in_out2);
  tmp4 = tmp3;
  tmp5 = in_out1;
  n_tmp6 = in_out2;

  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
          w_xtmp1, w_xtmp2, w_xtmp3,
          tmp4, tmp5,
          n_tmp6);

  pop(in_out2);
  pop(in_out1);
  pop(tmp3);

  addl(in_out2, 2 * size);            // skip over chunks B and C
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
7346 #endif //LP64
7347
7348 #ifdef _LP64
7349 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7350 // Input: A buffer I of L bytes.
7351 // Output: the CRC32C value of the buffer.
7352 // Notations:
7353 // Write L = 24N + r, with N = floor (L/24).
7354 // r = L mod 24 (0 <= r < 24).
7355 // Consider I as the concatenation of A|B|C|R, where A, B, C, each,
7356 // N quadwords, and R consists of r bytes.
7357 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7358 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7359 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7360 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
// CRC32C entry point (Algorithm 2, see comment above), LP64 version.
// in_out = crc, in1 = buffer pointer, in2 = length.
// Processes the buffer with three chunk sizes (CRC32C_HIGH, _MIDDLE,
// _LOW) via crc32c_proc_chunk, then finishes word-by-word and
// byte-by-byte. When PCLMULQDQ is supported the recombination
// constants are read from the head of the crc32c table; otherwise the
// values are indices into the precomputed lookup tables.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported ) {
    // u1/u2 constant pairs for each chunk size, stored pairwise
    // (swapped) at the start of the table
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    // fallback: indices of the precomputed lookup tables
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end of the 4-byte-aligned portion: buf + (len & ~7)
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  // consume remaining input 4 bytes at a time
  BIND(L_wordByWord);
  cmpq(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
    crc32(in_out, Address(in1, 0), 4);
    addq(in1, 4);
    jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);            // remaining byte count (0..7)
  movl(tmp2, 1);

  // consume the final 0..7 bytes one at a time
  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
    crc32(in_out, Address(in1, 0), 1);
    incq(in1);
    incl(tmp2);
    jmp(L_byteByByte);

  BIND(L_exit);
}
7437 #else
// 32-bit variant of the CRC32C entry point (Algorithm 2 above).
// in_out = crc, in1 = buffer pointer, in2 = length.
// Same structure as the LP64 version: three chunked passes, then a
// word-by-word and a byte-by-byte tail, with 32-bit address arithmetic
// and an explicit byte load (movb) in the final loop.
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported) {
    // u1/u2 constant pairs for each chunk size, stored pairwise
    // (swapped) at the start of the table
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
  } else {
    // fallback: indices of the precomputed lookup tables
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  // tmp1 = end of the 4-byte-aligned portion: buf + (len & ~7)
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addl(tmp1, in1);

  // consume remaining input 4 bytes at a time
  BIND(L_wordByWord);
  cmpl(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
    crc32(in_out, Address(in1,0), 4);
    addl(in1, 4);
    jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);            // remaining byte count (0..7)
  movl(tmp2, 1);

  // consume the final 0..7 bytes one at a time via an explicit load
  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
    movb(tmp1, Address(in1, 0));
    crc32(in_out, tmp1, 1);
    incl(in1);
    incl(tmp2);
    jmp(L_byteByByte);

  BIND(L_exit);
}
7514 #endif // LP64
7515 #undef BIND
7516 #undef BLOCK_COMMENT
7517
7518 // Compress char[] array to byte[].
7519 // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
7520 // @HotSpotIntrinsicCandidate
7521 // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
7522 // for (int i = 0; i < len; i++) {
7523 // int c = src[srcOff++];
7524 // if (c >>> 8 != 0) {
7525 // return 0;
7526 // }
7527 // dst[dstOff++] = (byte)c;
7528 // }
7529 // return len;
7530 // }
// Compress a UTF-16 char[] (src) into a byte[] (dst): each char that
// fits in a single byte is narrowed; if any char has its high byte set
// the operation fails and 'result' is 0, otherwise 'result' is the
// original length. Three code paths: AVX-512 masked (32 chars/iter),
// SSE4.2 vectored (16 then 8 chars/iter), and a scalar tail loop.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                         Register tmp5, Register result) {
  Label copy_chars_loop, return_length, return_zero, done;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return (popped at return_length / return_zero)
  push(len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it in an old fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressable ( <= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(result, 0x00FF);
    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);

    // only align dst when there is enough work for a full 64-char pass
    testl(len, -64);
    jcc(Assembler::zero, post_alignment);

    // tmp5 = number of chars needed to reach 32-byte alignment of dst
    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, tmp5);
    notl(result);
    kmovdl(k3, result);

    // masked load, check all loaded chars are <= 0xFF, masked store
    evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);

    // advance: src by 2*tmp5 bytes (chars), dst by tmp5 bytes
    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jcc(Assembler::zero, copy_loop_tail);

    // point src/dst past the vectored region and count len up to zero
    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(k2, k2);
    jcc(Assembler::carryClear, return_zero);

    // All elements in current processed chunk are valid candidates for
    // compression. Write truncated byte elements to the memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, return_length);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, len);
    notl(result);

    kmovdl(k3, result);

    // masked load/check/store for the final partial vector
    evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
    jmp(return_length);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail;

    movl(result, len);

    movl(tmp5, 0xff00ff00);  // create mask to test for Unicode chars in vectors

    // vectored compression
    andl(len, 0xfffffff0);   // vector count (in chars)
    andl(result, 0x0000000f);  // tail count (in chars)
    testl(len, len);
    jcc(Assembler::zero, copy_16);

    // compress 16 chars per iter
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp4Reg, tmp4Reg);        // running OR of all chars seen

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
    jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jcc(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    movl(len, result);
    andl(len, 0xfffffff8);    // vector count (in chars)
    andl(result, 0x00000007); // tail count (in chars)
    testl(len, len);
    jccb(Assembler::zero, copy_tail);

    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
    jccb(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);

    bind(copy_tail);
    movl(len, result);
  }
  // compress 1 char per iter
  testl(len, len);
  jccb(Assembler::zero, return_length);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(result, Address(src, len, Address::times_2));
  testl(result, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  // if compression succeeded, return length
  bind(return_length);
  pop(result);
  jmpb(done);

  // if compression failed, return 0
  bind(return_zero);
  xorl(result, result);
  addptr(rsp, wordSize);      // discard the saved length

  bind(done);
}
7733
7734 // Inflate byte[] array to char[].
7735 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
7736 // @HotSpotIntrinsicCandidate
7737 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
7738 // for (int i = 0; i < len; i++) {
7739 // dst[dstOff++] = (char)(src[srcOff++] & 0xff);
7740 // }
7741 // }
// Inflate a Latin-1 byte[] (src) into a UTF-16 char[] (dst): each byte
// is zero-extended to a 16-bit char. Three code paths: AVX-512 masked
// (32 chars/iter), SSE4.2/AVX2 vectored (16 or 8 chars/iter), and a
// scalar tail loop. Clobbers src, dst, len, tmp1, tmp2.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        XMMRegister tmp1, Register tmp2) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;  // len is free once the counts are split

    // if length of the string is less than 16, handle it in an old fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    // use the AVX-512 path only at or above AVX3Threshold
    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1));   // tail count (in chars), 32 element wide loop
    andl(len, -32);         // vector count
    jccb(Assembler::zero, copy_tail);

    // point src/dst past the vectored region and count len up to zero
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);


    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(k2, tmp3_aliased);
    // masked zero-extend + masked store for the partial final vector
    evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));   // tail count for the 16-wide loop
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      // inflate 16 chars per iter (AVX2)
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      bind(below_threshold);
      bind(copy_new_tail);
      // re-split the remaining count for one more 8-wide pass
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    // inflate 4 chars with one vector op, then fall through to scalar
    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  // scalar tail: nothing left?
  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}
7883
7884 #ifdef _LP64
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  // cvttss2si produces 0x80000000 (the x86 "integer indefinite" value) for
  // NaN and for results that do not fit in 32 bits, so only that value needs
  // the slow path.  An input that legitimately converts to 0x80000000
  // (Integer.MIN_VALUE) also takes the stub, which must hand back the same value.
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  // Slow path: pass the float through a fresh stack slot; the fixup stub is
  // expected to leave the JLS-correct result in that slot, and the pop both
  // retrieves it and restores rsp.
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);
  bind(done);
}
7897
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  // cvttsd2si produces 0x80000000 ("integer indefinite") for NaN and
  // out-of-range doubles; only that value needs the slow path.
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  // Slow path: spill the double to the stack for the fixup stub, then pop the
  // stub's corrected result into dst (also restores rsp).
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}
7910
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  // cvttss2si (64-bit form) produces 0x8000000000000000 ("integer indefinite")
  // for NaN and out-of-range floats; double_sign_flip holds that 64-bit
  // pattern, so only a match needs the fixup stub.
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  // Slow path: spill the float to the stack for the fixup stub, then pop the
  // stub's corrected 64-bit result into dst (also restores rsp).
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}
7922
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  // cvttsd2si (64-bit form) produces 0x8000000000000000 ("integer indefinite")
  // for NaN and out-of-range doubles; double_sign_flip holds that 64-bit
  // pattern, so only a match needs the fixup stub.
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  // Slow path: spill the double to the stack for the fixup stub, then pop the
  // stub's corrected 64-bit result into dst (also restores rsp).
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}
7934
cache_wb(Address line)7935 void MacroAssembler::cache_wb(Address line)
7936 {
7937 // 64 bit cpus always support clflush
7938 assert(VM_Version::supports_clflush(), "clflush should be available");
7939 bool optimized = VM_Version::supports_clflushopt();
7940 bool no_evict = VM_Version::supports_clwb();
7941
7942 // prefer clwb (writeback without evict) otherwise
7943 // prefer clflushopt (potentially parallel writeback with evict)
7944 // otherwise fallback on clflush (serial writeback with evict)
7945
7946 if (optimized) {
7947 if (no_evict) {
7948 clwb(line);
7949 } else {
7950 clflushopt(line);
7951 }
7952 } else {
7953 // no need for fence when using CLFLUSH
7954 clflush(line);
7955 }
7956 }
7957
cache_wbsync(bool is_pre)7958 void MacroAssembler::cache_wbsync(bool is_pre)
7959 {
7960 assert(VM_Version::supports_clflush(), "clflush should be available");
7961 bool optimized = VM_Version::supports_clflushopt();
7962 bool no_evict = VM_Version::supports_clwb();
7963
7964 // pick the correct implementation
7965
7966 if (!is_pre && (optimized || no_evict)) {
7967 // need an sfence for post flush when using clflushopt or clwb
7968 // otherwise no no need for any synchroniaztion
7969
7970 sfence();
7971 }
7972 }
7973 #endif // _LP64
7974
negate_condition(Assembler::Condition cond)7975 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
7976 switch (cond) {
7977 // Note some conditions are synonyms for others
7978 case Assembler::zero: return Assembler::notZero;
7979 case Assembler::notZero: return Assembler::zero;
7980 case Assembler::less: return Assembler::greaterEqual;
7981 case Assembler::lessEqual: return Assembler::greater;
7982 case Assembler::greater: return Assembler::lessEqual;
7983 case Assembler::greaterEqual: return Assembler::less;
7984 case Assembler::below: return Assembler::aboveEqual;
7985 case Assembler::belowEqual: return Assembler::above;
7986 case Assembler::above: return Assembler::belowEqual;
7987 case Assembler::aboveEqual: return Assembler::below;
7988 case Assembler::overflow: return Assembler::noOverflow;
7989 case Assembler::noOverflow: return Assembler::overflow;
7990 case Assembler::negative: return Assembler::positive;
7991 case Assembler::positive: return Assembler::negative;
7992 case Assembler::parity: return Assembler::noParity;
7993 case Assembler::noParity: return Assembler::parity;
7994 }
7995 ShouldNotReachHere(); return Assembler::overflow;
7996 }
7997
SkipIfEqual(MacroAssembler * masm,const bool * flag_addr,bool value)7998 SkipIfEqual::SkipIfEqual(
7999 MacroAssembler* masm, const bool* flag_addr, bool value) {
8000 _masm = masm;
8001 _masm->cmp8(ExternalAddress((address)flag_addr), value);
8002 _masm->jcc(Assembler::equal, _label);
8003 }
8004
SkipIfEqual::~SkipIfEqual() {
  // Bind the skip target so the jcc emitted by the constructor lands here,
  // just past the code generated while this scope was active.
  _masm->bind(_label);
}
8008
8009 // 32-bit Windows has its own fast-path implementation
8010 // of get_thread
8011 #if !defined(WIN32) || defined(_LP64)
8012
8013 // This is simply a call to Thread::current()
void MacroAssembler::get_thread(Register thread) {
  // Load the current Thread* into 'thread' by calling Thread::current().
  // Every C-ABI caller-saved register except rax (which carries the return
  // value) is manually preserved around the call so this can be emitted from
  // arbitrary generated code without clobbering live values.
  if (thread != rax) {
    // The call clobbers rax with its result; save the caller's rax so it can
    // be restored after the result is moved into 'thread'.
    push(rax);
  }
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  push(rdx);
  push(rcx);
#ifdef _LP64
  push(r8);
  push(r9);
  push(r10);
  push(r11);
#endif

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  // Restore in the exact reverse order of the pushes above.
#ifdef _LP64
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
#endif
  pop(rcx);
  pop(rdx);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {
    // Move the result out of rax, then restore the caller's original rax.
    mov(thread, rax);
    pop(rax);
  }
}
8046
8047 #endif // !WIN32 || _LP64
8048