/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

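// A minimal sketch of what the instructions below compute, in C terms
// (assuming 'src' holds the residual element count of a post loop):
//   dst = (1 << src) - 1;  // movl/shlxl/decl: low 'src' bits set
//   k1  = dst;             // kmovdl: lanes [0, src) become active
// so subsequent masked vector instructions touch only the first 'src' lanes.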
void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
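// A note on the sampling trick below: the low-order TSC bits serve as a cheap
// pseudo-random source. With count = 2^n, (random & (count-1)) == 0 holds for
// roughly 1 in count invocations, so e.g. RTMTotalCountIncrRate == 64 means
// the counter update path is taken about 1/64 of the time.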
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
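  // A hypothetical worked example (values chosen purely for illustration):
  // with RTMTotalCountIncrRate = 64 and RTMAbortRatio = 50, observed
  // abort_count = 1000 and total_count = 100 give
  //   Aborted = 1000 * 100 = 100000
  //   All     = 100  * 64  = 6400,  All * RTMAbortRatio = 320000
  // so 100000 < 320000 and the no_rtm bit stays clear.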

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if the abort status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
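// In C terms, the retry predicate below is roughly:
//   if ((abort_status & 0x6) != 0 && retry_count != 0) {
//     pause(); --retry_count; goto retryLabel;
//   }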
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
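// Roughly, in C terms:
//   if (retry_count == 0) { ++retry_count; return; }  // leaves ZF == 0
//   --retry_count;
//   for (int i = RTMSpinLoopCount; i > 0; i--) {
//     pause();
//     if (monitor->owner == NULL) break;              // lock looks free
//   }
//   goto retryLabel;                                  // retry either way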
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
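// The transactional fast path below elides the stack lock entirely: xbegin
// starts a transaction, the markword is re-read inside it, and if the object
// is unlocked we simply fall through to the critical section. The matching
// xend in fast_unlock commits the transaction; no store to the markword is
// ever made, so uncontended threads do not bounce the header cache line.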
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
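// Same idea for inflated locks: inside the transaction the monitor's _owner
// field is read, and if it is NULL the lock is elided and control falls
// through to DONE_LABEL without writing to the monitor. Only on abort, or if
// the monitor is actually owned, do we fall back to a real CAS on _owner.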
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

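  // For reference, the low markword bits the triage below relies on
  // (see markWord.hpp): 01 => unlocked/neutral, 00 => stack-locked,
  // 10 => inflated (monitor), and 101 => biased; biased_lock_mask covers
  // the low three bits, so one AND+CMP can distinguish these states.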
  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr(tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL); // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);
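  // A worked example of the recursion test above, assuming a 4 KiB page on
  // LP64: the mask is 7 - 4096 = 0x...FFFFF007, so the AND yields zero (ZF=1)
  // exactly when mark - rsp is 8-byte aligned (a real stack address rather
  // than an *01 unlocked header) and smaller than one page, i.e. the existing
  // BasicLock lives in the current thread's frame.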

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb(Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread(scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc(Assembler::zero, DONE_LABEL);                                 // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb(Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);

  bind(Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb(Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb(DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind(CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb(Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb(Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb(Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind(LGoSlowPath);
  orl(boxReg, 1);    // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);

  bind(LSuccess);
  testl(boxReg, 0);  // set ICC.ZF=1 to indicate success
  jmpb(DONE_LABEL);

  bind(Stacked);
  movptr(tmpReg, Address(boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

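// A note on the T_LONG cases below: SSE4.2 has no packed min/max for 64-bit
// lanes, so they are synthesized from a signed compare plus a blend. In
// scalar terms, per lane:
//   mask = (dst > src) ? ~0 : 0;   // pcmpgtq
//   dst  = mask ? src : dst;       // blendvpd keyed on xmm0's sign bits
// which yields min; max swaps the compare operands.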
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

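// The blend/compare dance below implements Java's Math.min/max semantics for
// floats and doubles rather than plain vminps/vmaxps: min(-0.0, +0.0) must be
// -0.0, and any NaN input must produce NaN. The first pair of blends (keyed
// on a sign bit) orders the operands so the hardware min/max picks the right
// signed zero; the final UNORD_Q compare-and-blend forces NaN through when an
// input was NaN (vminps alone would just return the second operand).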
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

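// The AVX-512 variant plays the same game, but blends through an opmask
// register instead of an XMM mask: evpmovd2m/evpmovq2m extracts the per-lane
// sign bits into ktmp, evblendm* performs the operand ordering, and the final
// masked evmovdqu* overwrites NaN lanes (found by the UNORD_Q compare) in dst.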
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

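// A note on the AVX2 Op_RShiftVL fallback below: there is no vpsravq before
// AVX-512, so the arithmetic shift is synthesized from the logical one via
// the sign-extension identity (per 64-bit lane, s = shift amount):
//   m = 0x8000000000000000 >>> s;      // the shifted sign bit
//   sra(x, s) == ((x >>> s) ^ m) - m;  // XOR/SUB smears the sign back in
// e.g. x = -8, s = 1: x >>> 1 = 0x7FF...FC, m = 0x400...0, and
// (0x7FF...FC ^ m) - m = -4 as required.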
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
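
// Per-element sketch of the sequence above (illustration only): x86 has no
// variable byte shift, so each byte is widened to 32 bits, shifted with the
// 32-bit variable-shift instruction, masked back to its low 8 bits, and the
// halves are repacked:
//   t = sign ? (int32_t)(int8_t)b : (uint32_t)(uint8_t)b;
//   t = shift_op(t, s);
//   out = pack(t & 0xFF);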

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
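
// Note (illustration): vpackuswb packs within each 128-bit lane, so on wide
// vectors the packed halves come out lane-interleaved. The trailing vpermq
// with 0xD8 (qword selector 0b11'01'10'00, i.e. order 0,2,1,3) restores the
// original element order.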

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
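
// Illustration (not emitted code): the incoming vector holds boolean bytes
// (0 or 1). Negation produces the canonical mask byte, which sign-extension
// then widens to the element size:
//   0 - 0 == 0x00 -> 0x00000000
//   0 - 1 == 0xFF -> 0xFFFFFFFF   (after pmovsxbd)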

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT:   pminsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT:   pmaxsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT:   paddd(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT:   pmulld(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceB(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
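
// Scalar model of the two-element base case above (illustration only):
//   dst = src1 OP src2[0] OP src2[1]
// For addition, phaddd folds src2[0]+src2[1] into lane 0 in one step; for
// min/max/and/or/xor/mul, lane 1 is shuffled down and combined pairwise
// (the integer ops are associative and commutative, so the folding order
// is free). The wider reduceNI variants repeatedly halve the vector until
// they reach this base case.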

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
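
// Byte reductions (illustration): there are no 8-bit horizontal
// instructions, so reduce8B above folds halves with pshufd/psrldq
// (8 -> 4 -> 2 -> 1 active bytes), widens the surviving byte to 32 bits,
// folds in src1 as an int, and finally sign-extends the result byte into
// the destination register.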

void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2, "Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  int vector_len = Assembler::AVX_256bit;
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
  if (ArrayCopyPartialInlineSize <= 32) {
    mov64(dst, 1);
    shlxq(dst, dst, len);
    decq(dst);
  } else {
    mov64(dst, -1);
    movq(temp, len);
    negptr(temp);
    addptr(temp, 64);
    shrxq(dst, dst, temp);
  }
}
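
// Worked example (illustration): for len == 5 either branch yields
// 0b11111, i.e. a mask with the low len bits set. The first branch computes
//   dst = (1 << len) - 1
// and the second computes the same value as -1 >>> (64 - len). The second
// form is needed for larger inline sizes because shlxq takes its count
// mod 64, so (1 << 64) - 1 would come out as 0 rather than all ones.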
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
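
// Ordered FP reduction (illustration): unlike the integer reductions, the
// float paths fold lanes strictly in index order,
//   dst = (((dst op s[0]) op s[1]) op s[2]) op s[3]
// which matches the result of a sequential scalar loop; float add/mul are
// not associative, so the folding order is observable in the rounding.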

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }
    vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}
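
// Shape of the min/max reduction loop (illustration): each iteration folds
// the upper half of the working vector onto the lower half, so for
// vlen == 16 floats, i == 3 extracts the high 256 bits, i == 2 the high
// 128 bits, and i == 1 / i == 0 use vpermilps with 14 (lanes 2,3 down) and
// 1 (lane 1 down). After log2(vlen) vminmax_fp steps lane 0 holds the
// reduced value.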

void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }
    vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  switch (bt) {
    case T_BYTE:  pextrb(dst, src, idx); break;
    case T_SHORT: pextrw(dst, src, idx); break;
    case T_INT:   pextrd(dst, src, idx); break;
    case T_LONG:  pextrq(dst, src, idx); break;

    default:
      assert(false, "Should not reach here.");
      break;
  }
}

XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}
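
// Worked example (illustration): for typ == T_SHORT, esize == 2 and a
// 128-bit lane holds 8 elements, so elemindex 11 gives lane 1, eindex 3.
// get_lane then extracts the upper 128 bits, and the caller picks element
// 3 out of the returned register; for elemindex <= 7 the source register
// is returned unchanged.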

void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ), "required");

  if (eindex == 0) {
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      if (typ == T_BYTE)
        movsbl(dst, dst);
      else if (typ == T_SHORT)
        movswl(dst, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE), "required");

  if (eindex == 0) {
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        pshufps(dst, dst, eindex);
      } else {
        vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert((vtmp != xnoreg) && (tmp != noreg), "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
      pand(dst, vtmp);
    } else {
      assert((tmp != noreg), "required.");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
    }
  }
}

void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
  switch(typ) {
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch(vlen) {
    case 4:
      assert(vtmp1 != xnoreg, "required.");
      // Broadcast lower 32 bits to 128 bits before ptest
      pshufd(vtmp1, src1, 0x0);
      if (bt == BoolTest::overflow) {
        assert(vtmp2 != xnoreg, "required.");
        pshufd(vtmp2, src2, 0x0);
      } else {
        assert(vtmp2 == xnoreg, "required.");
        vtmp2 = src2;
      }
      ptest(vtmp1, vtmp2);
      break;
    case 8:
      assert(vtmp1 != xnoreg, "required.");
      // Broadcast lower 64 bits to 128 bits before ptest
      pshufd(vtmp1, src1, 0x4);
      if (bt == BoolTest::overflow) {
        assert(vtmp2 != xnoreg, "required.");
        pshufd(vtmp2, src2, 0x4);
      } else {
        assert(vtmp2 == xnoreg, "required.");
        vtmp2 = src2;
      }
      ptest(vtmp1, vtmp2);
      break;
    case 16:
      assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
      ptest(src1, src2);
      break;
    case 32:
      assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
      vptest(src1, src2, Assembler::AVX_256bit);
      break;
    case 64:
      {
        KRegister ktemp = k2; // Use a hardcoded temp due to no k register allocation.
        assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
        evpcmpeqb(ktemp, src1, src2, Assembler::AVX_512bit);
        if (bt == BoolTest::ne) {
          ktestql(ktemp, ktemp);
        } else {
          assert(bt == BoolTest::overflow, "required");
          kortestql(ktemp, ktemp);
        }
      }
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8;      // UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
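
  // Flag usage below (illustration): with imm8 0x0d pcmpestri compares
  // unsigned word elements using the "equal ordered" (substring search)
  // aggregation; 0x0c is the unsigned byte variant. After the instruction,
  // CF is set when any candidate match was found (rcx then holds its
  // element index) and OF is set when the match starts at element 0, i.e.
  // the whole loaded substring head matched at this position.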

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Fewer elements left than in the substring

    addptr(result, (1<<scale1));
  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than in the substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND); // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Fewer elements left than in the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through the stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond the string if str+16 does not cross a page
      // boundary, since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to the stack to allow loading 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Fewer elements left than in the substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than in the substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Fewer elements left than in the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)

2747 bind(SCAN_TO_16_CHAR_LOOP);
2748 vmovdqu(vec3, Address(result, 0));
2749 vpcmpeqw(vec3, vec3, vec1, 1);
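    // vec3 now holds 0xFFFF in every lane that matched 'ch'. vptest against
    // the all-zero vec2 sets CF iff vec3 is all zero, so carryClear below
    // means at least one lane matched.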
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
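  // tmp is a per-byte match mask; a matching UTF-16 char sets two adjacent
  // bits, so bsfl yields the (even) byte offset of the first match. The char
  // index is recovered below by subtracting str1 and shifting right by one.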
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

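// Latin-1 (byte[]) variant of the scan above: same structure, but each
// element is a byte, so 32 (AVX2) or 16 (SSE) elements are compared per
// iteration and no final shift is needed to turn the byte offset into an
// index.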
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  // vector count (in bytes)
    andl(cnt1, 0x0000001F); // tail count (in bytes)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  // vector count (in bytes)
  andl(cnt1, 0x0000000F); // tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
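// Leaves a negative value, zero, or a positive value in 'result' as str1 is
// less than, equal to, or greater than str2 over the compared prefix, with
// ties broken by the length difference (the java.lang.String.compareTo
// shape; for UL the sign is flipped at the end since operands were swapped).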
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale  = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // The minimum is computed with a conditional move.
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
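  // Example (hypothetical lengths): cnt1 == 5, cnt2 == 3 pushes the length
  // difference (+2) on the stack and leaves cnt2 == min(5, 3) == 3; if the
  // first 3 elements match, LENGTH_DIFF_LABEL pops +2 as the result.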

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-char (32-byte) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  // stride << scale1;
      adr_stride2 = 16; // stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
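    // Decoding the imm8 (a reference note; see the SDM for the full table):
    //   pcmpmask = 0x19 = 0b011001 -> bits[1:0] = 01 unsigned words,
    //   bits[3:2] = 10 "equal each" (string compare), bits[5:4] = 01 negate
    //   the result. Clearing bit 0 (0x18) switches to unsigned bytes for LL.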
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1 (rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2); // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, k7);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }
}

// Search for a non-ASCII character (negative byte value) in a byte array,
// return true if it contains any and false otherwise:
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   private static boolean hasNegatives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return true;
//       }
//     }
//     return false;
//   }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(k2, k2);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
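    // e.g. (hypothetical tail) tmp1 == 5: ~0 << 5 = ...11100000, negated
    // gives 0b11111, i.e. a k-mask selecting exactly the 5 tail bytes.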
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(k3, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register on 32-bit, thus
    // we move the data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later used
    // as a compare target with the tail count contained in the tmp1 register.
    // The result is a k register having tmp1 consecutive 1's, counting from
    // the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
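    // Masked compare: only the first tmp1 lanes (selected by k3) participate;
    // k2 gets a 1 for every selected byte that is negative (0 > byte, signed).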
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for negative bytes (sign bit set) in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
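      // vptest below sets ZF iff (vec1 & vec2) == 0; notZero therefore means
      // some byte in the 32-byte block has its sign bit set, i.e. is negative.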

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
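// Roughly equivalent Java-level loop (a sketch for reference; the substring
// form, is_array_equ == false, skips the header checks and takes the length
// from the caller):
//   public static boolean equals(byte[] a, byte[] a2) {
//     if (a == a2) return true;
//     if (a == null || a2 == null || a.length != a2.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != a2[i]) return false;
//     }
//     return true;
//   }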
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //   cmpl(limit, 0);
      //   jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64); // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    } // if (VM_Version::supports_avx512vlbw())
#endif // _LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
