/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

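// setvectmask materializes an all-ones mask over the low `src` lanes in the
// AVX-512 opmask register k1: dst = (1 << src) - 1, e.g. src == 3 yields
// 0b111. restorevectmask rebuilds the default all-lanes mask in k1 by
// inverting k0 (assumed zero here).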
void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
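// The low bits of the time-stamp counter serve as a cheap pseudo-random
// source: the branch is taken unless (TSC & (count-1)) == 0, so the
// fall-through (increment) path runs roughly once every `count` calls.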
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

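  // Equivalently: no_rtm is set once
  //   abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio
  // i.e. once the measured abort percentage reaches RTMAbortRatio.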
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
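// OM_OFFSET_NO_MONITOR_VALUE_TAG folds the markWord::monitor_value tag that
// is carried in the ObjectMonitor pointer into the field offset, so the
// tagged pointer can be used directly as the base register.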
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
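// A sketch of the flow below: the mark word is inspected first and, if the
// object is already inflated, we bail out to IsInflated. Otherwise an RTM
// transaction is opened with xbegin; inside it the mark word is re-read
// and, if it shows the unlocked pattern (lock bits 01), the lock is elided
// and control falls through to DONE_LABEL without any store. A conflicting
// write to the mark word by another thread aborts the transaction to
// L_on_abort, where the abort status in RAX drives profiling and the
// RTMRetryCount-bounded retry policy.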
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflated locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
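// Flow sketch: the monitor's _owner field is read inside the transaction;
// if it is NULL the lock is treated as elided and control falls through to
// DONE_LABEL with no store. On abort (or a busy owner) the code either
// retries, spins via rtm_retry_lock_on_busy, or finally falls back to a
// plain CAS of the current thread into _owner.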
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
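  // The mask keeps the low alignment bits and everything at or above one
  // page while clearing the in-page offset bits, so ZF is set only when
  // mark - rsp is a small, suitably aligned non-negative offset, i.e. the
  // BasicLock pointer in the mark word lies less than one page above rsp.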
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
733 bind (CheckSucc);
734 #else // _LP64
735 // It's inflated
736 xorptr(boxReg, boxReg);
737 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
738 jccb (Assembler::notZero, DONE_LABEL);
739 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
740 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
741 jccb (Assembler::notZero, CheckSucc);
742 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
743 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
744 jmpb (DONE_LABEL);
745
746 // Try to avoid passing control into the slow_path ...
747 Label LSuccess, LGoSlowPath ;
748 bind (CheckSucc);
749
750 // The following optional optimization can be elided if necessary
751 // Effectively: if (succ == null) goto slow path
752 // The code reduces the window for a race, however,
753 // and thus benefits performance.
754 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
755 jccb (Assembler::zero, LGoSlowPath);
756
757 xorptr(boxReg, boxReg);
758 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
759 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
760
761 // Memory barrier/fence
762 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
763 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
764 // This is faster on Nehalem and AMD Shanghai/Barcelona.
765 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
766 // We might also restructure (ST Owner=0;barrier;LD _Succ) to
767 // (mov box,0; xchgq box, &m->Owner; LD _succ) .
768 lock(); addl(Address(rsp, 0), 0);
769
770 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
771 jccb (Assembler::notZero, LSuccess);
772
773 // Rare inopportune interleaving - race.
774 // The successor vanished in the small window above.
775 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
776 // We need to ensure progress and succession.
777 // Try to reacquire the lock.
778 // If that fails then the new owner is responsible for succession and this
779 // thread needs to take no further action and can exit via the fast path (success).
780 // If the re-acquire succeeds then pass control into the slow path.
781 // As implemented, this latter mode is horrible because we generated more
782 // coherence traffic on the lock *and* artifically extended the critical section
783 // length while by virtue of passing control into the slow path.
784
785 // box is really RAX -- the following CMPXCHG depends on that binding
786 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
787 lock();
788 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
789 // There's no successor so we tried to regrab the lock.
790 // If that didn't work, then another thread grabbed the
791 // lock so we're done (and exit was a success).
792 jccb (Assembler::notEqual, LSuccess);
793 // Intentional fall-through into slow path
794
795 bind (LGoSlowPath);
796 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
797 jmpb (DONE_LABEL);
798
799 bind (LSuccess);
800 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
801 jmpb (DONE_LABEL);
802
803 bind (Stacked);
804 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
805 lock();
806 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
807
808 #endif
809 bind(DONE_LABEL);
810 }

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

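// The float/double abs and neg operations below are bitwise tricks: ANDing
// with a constant that clears each lane's sign bit yields |x|, while XORing
// with a constant that has only the sign bit set flips the sign. The
// vector_*_sign_mask / vector_*_sign_flip stubs hold those per-lane
// constants.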
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, src);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, src);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, src);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    psraw(dst, src);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    psllw(dst, src);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    psrlw(dst, src);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    vpsraw(dst, nds, src, vector_len);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    vpsllw(dst, nds, src, vector_len);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    vpsrlw(dst, nds, src, vector_len);
  }
}

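// Note: x86 has no 64-bit arithmetic right shift (psraq) before AVX-512,
// so the 128-bit form below substitutes a logical shift (see the inline
// comment) while the EVEX form can use evpsraq directly.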
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, src); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, src);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, src);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, src, vector_len);
  }
}

// Reductions for vectors of ints, longs, floats, and doubles.

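// The reductions follow the usual log2 fold: each step extracts the upper
// half of the vector and combines it with the lower half using the 128-bit
// (or scalar) flavour of the operation, halving the active width until one
// lane remains; the integer/long variants then fold in the incoming scalar
// accumulator (src1).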
void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;

    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI: paddd(dst, src); break;
    case Op_AddReductionVL: paddq(dst, src); break;

    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI: pmulld(dst, src); break;
    case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;

    case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;

    case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
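  // The branches below key off pcmpestri's flag outputs: CF is set when any
  // match is found (rcx then holds the first matching index) and OF mirrors
  // a match at element 0, i.e. the whole vector matched.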
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through the stack if they cross a page boundary.
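// (The SSE4.2 loops below always load full 16-byte vectors, so a short
// string whose tail sits near the end of a mapped page could fault on the
// over-read; copying such strings to the stack first makes those loads safe.)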
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond the string if str2+16 does not cross a page
      // boundary, since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);
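      // Worked example, assuming a 4096-byte page: if (str2 & 4095) <= 4080
      // then bytes str2 .. str2+15 all lie on the same page, so a full
      // 16-byte load at str2 cannot touch the next (possibly unmapped) page.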
1448
1449 // Move small strings to stack to allow load 16 bytes into vec.
1450 subptr(rsp, 16);
1451 int stk_offset = wordSize-(1<<scale2);
1452 push(cnt2);
1453
1454 bind(COPY_SUBSTR);
1455 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
1456 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
1457 movb(Address(rsp, cnt2, scale2, stk_offset), result);
1458 } else if (ae == StrIntrinsicNode::UU) {
1459 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
1460 movw(Address(rsp, cnt2, scale2, stk_offset), result);
1461 }
1462 decrement(cnt2);
1463 jccb(Assembler::notZero, COPY_SUBSTR);
1464
1465 pop(cnt2);
1466 movptr(str2, rsp); // New substring address
1467 } // non constant
1468
1469 bind(CHECK_STR);
1470 cmpl(cnt1, stride);
1471 jccb(Assembler::aboveEqual, BIG_STRINGS);
1472
1473 // Check cross page boundary.
1474 movl(result, str1); // We need only low 32 bits
1475 andl(result, (os::vm_page_size()-1));
1476 cmpl(result, (os::vm_page_size()-16));
1477 jccb(Assembler::belowEqual, BIG_STRINGS);
1478
1479 subptr(rsp, 16);
1480 int stk_offset = -(1<<scale1);
1481 if (int_cnt2 < 0) { // not constant
1482 push(cnt2);
1483 stk_offset += wordSize;
1484 }
1485 movl(cnt2, cnt1);
1486
1487 bind(COPY_STR);
1488 if (ae == StrIntrinsicNode::LL) {
1489 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
1490 movb(Address(rsp, cnt2, scale1, stk_offset), result);
1491 } else {
1492 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
1493 movw(Address(rsp, cnt2, scale1, stk_offset), result);
1494 }
1495 decrement(cnt2);
1496 jccb(Assembler::notZero, COPY_STR);
1497
1498 if (int_cnt2 < 0) { // not constant
1499 pop(cnt2);
1500 }
1501 movptr(str1, rsp); // New string address
1502
1503 bind(BIG_STRINGS);
1504 // Load substring.
1505 if (int_cnt2 < 0) { // -1
1506 if (ae == StrIntrinsicNode::UL) {
1507 pmovzxbw(vec, Address(str2, 0));
1508 } else {
1509 movdqu(vec, Address(str2, 0));
1510 }
1511 push(cnt2); // substr count
1512 push(str2); // substr addr
1513 push(str1); // string addr
1514 } else {
1515 // Small (< 8 chars) constant substrings are loaded already.
1516 movl(cnt2, int_cnt2);
1517 }
1518 push(tmp); // original SP
1519
1520 } // Finished loading
1521
1522 //========================================================
1523 // Start search
1524 //
1525
1526 movptr(result, str1); // string addr
1527
1528 if (int_cnt2 < 0) { // Only for non constant substring
1529 jmpb(SCAN_TO_SUBSTR);
1530
1531 // SP saved at sp+0
1532 // String saved at sp+1*wordSize
1533 // Substr saved at sp+2*wordSize
1534 // Substr count saved at sp+3*wordSize
1535
1536 // Reload substr for rescan, this code
1537 // is executed only for large substrings (> 8 chars)
1538 bind(RELOAD_SUBSTR);
1539 movptr(str2, Address(rsp, 2*wordSize));
1540 movl(cnt2, Address(rsp, 3*wordSize));
1541 if (ae == StrIntrinsicNode::UL) {
1542 pmovzxbw(vec, Address(str2, 0));
1543 } else {
1544 movdqu(vec, Address(str2, 0));
1545 }
1546 // We came here after the beginning of the substring was
1547 // matched but the rest of it was not so we need to search
1548 // again. Start from the next element after the previous match.
1549 subptr(str1, result); // Restore counter
1550 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1551 shrl(str1, 1);
1552 }
1553 addl(cnt1, str1);
1554 decrementl(cnt1); // Shift to next element
1555 cmpl(cnt1, cnt2);
1556 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
1557
1558 addptr(result, (1<<scale1));
1559 } // non constant
1560
1561 // Scan string for start of substr in 16-byte vectors
1562 bind(SCAN_TO_SUBSTR);
1563 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1564 pcmpestri(vec, Address(result, 0), mode);
1565 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
1566 subl(cnt1, stride);
1567 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1568 cmpl(cnt1, cnt2);
1569 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
1570 addptr(result, 16);
1571
1572 bind(ADJUST_STR);
1573 cmpl(cnt1, stride); // Do not read beyond string
1574 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1575 // Back-up string to avoid reading beyond string.
1576 lea(result, Address(result, cnt1, scale1, -16));
1577 movl(cnt1, stride);
1578 jmpb(SCAN_TO_SUBSTR);
1579
1580 // Found a potential substr
1581 bind(FOUND_CANDIDATE);
1582 // After pcmpestri tmp(rcx) contains matched element index
1583
1584 // Make sure string is still long enough
1585 subl(cnt1, tmp);
1586 cmpl(cnt1, cnt2);
1587 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
1588 // Left less then substring.
1589
1590 bind(RET_NOT_FOUND);
1591 movl(result, -1);
1592 jmp(CLEANUP);
1593
1594 bind(FOUND_SUBSTR);
1595 // Compute start addr of substr
1596 lea(result, Address(result, tmp, scale1));
1597 if (int_cnt2 > 0) { // Constant substring
1598 // Repeat search for small substring (< 8 chars)
1599 // from new point without reloading substring.
1600 // Have to check that we don't read beyond string.
1601 cmpl(tmp, stride-int_cnt2);
1602 jccb(Assembler::greater, ADJUST_STR);
1603 // Fall through if matched whole substring.
1604 } else { // non constant
1605 assert(int_cnt2 == -1, "should be != 0");
1606
1607 addl(tmp, cnt2);
1608 // Found result if we matched whole substring.
1609 cmpl(tmp, stride);
1610 jcc(Assembler::lessEqual, RET_FOUND);
1611
1612 // Repeat search for small substring (<= 8 chars)
1613 // from new point 'str1' without reloading substring.
1614 cmpl(cnt2, stride);
1615 // Have to check that we don't read beyond string.
1616 jccb(Assembler::lessEqual, ADJUST_STR);
1617
1618 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
1619 // Compare the rest of substring (> 8 chars).
1620 movptr(str1, result);
1621
1622 cmpl(tmp, cnt2);
1623 // First 8 chars are already matched.
1624 jccb(Assembler::equal, CHECK_NEXT);
1625
1626 bind(SCAN_SUBSTR);
1627 pcmpestri(vec, Address(str1, 0), mode);
1628 // Need to reload strings pointers if not matched whole vector
1629 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1630
1631 bind(CHECK_NEXT);
1632 subl(cnt2, stride);
1633 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
1634 addptr(str1, 16);
1635 if (ae == StrIntrinsicNode::UL) {
1636 addptr(str2, 8);
1637 } else {
1638 addptr(str2, 16);
1639 }
1640 subl(cnt1, stride);
1641 cmpl(cnt2, stride); // Do not read beyond substring
1642 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
1643 // Back-up strings to avoid reading beyond substring.
1644
1645 if (ae == StrIntrinsicNode::UL) {
1646 lea(str2, Address(str2, cnt2, scale2, -8));
1647 lea(str1, Address(str1, cnt2, scale1, -16));
1648 } else {
1649 lea(str2, Address(str2, cnt2, scale2, -16));
1650 lea(str1, Address(str1, cnt2, scale1, -16));
1651 }
1652 subl(cnt1, cnt2);
1653 movl(cnt2, stride);
1654 addl(cnt1, stride);
1655 bind(CONT_SCAN_SUBSTR);
1656 if (ae == StrIntrinsicNode::UL) {
1657 pmovzxbw(vec, Address(str2, 0));
1658 } else {
1659 movdqu(vec, Address(str2, 0));
1660 }
1661 jmp(SCAN_SUBSTR);
1662
1663 bind(RET_FOUND_LONG);
1664 movptr(str1, Address(rsp, wordSize));
1665 } // non constant
1666
1667 bind(RET_FOUND);
1668 // Compute substr offset
1669 subptr(result, str1);
1670 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1671 shrl(result, 1); // index
1672 }
1673 bind(CLEANUP);
1674 pop(rsp); // restore SP
1675
1676 } // string_indexof

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
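//
// A rough Java-level sketch of the value computed (illustrative only; the
// method name and signature below are assumptions, not actual library code):
//
//   static int compare(char[] str1, int cnt1, char[] str2, int cnt2) {
//     int min = Math.min(cnt1, cnt2);
//     for (int i = 0; i < min; i++) {
//       if (str1[i] != str2[i]) {
//         return str1[i] - str2[i]; // difference of first mismatching elements
//       }
//     }
//     return cnt1 - cnt2; // equal up to min length: length difference decides
//   }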
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and set up scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Set up to compare 16-char (32-byte) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // Fast path: compare the first two 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Set up the registers to start the vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying a 64-byte fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
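    // For reference, pcmpmask 0x19 decomposes as: bits 1:0 = 01 (unsigned
    // words; cleared to 00 for LL above), bits 3:2 = 10 (equal each, i.e.
    // element-wise compare) and bits 5:4 = 01 (negate the result), so rcx
    // reports the index of the first mismatching element.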
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2); // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, k7);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for a non-ASCII character (negative byte value) in a byte array;
// return true if the array has any and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @HotSpotIntrinsicCandidate
// private static boolean hasNegatives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return true;
//     }
//   }
//   return false;
// }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(k2, k2);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
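    // Example: for a tail count of 5, ~(~0 << 5) == 0b11111, i.e. a mask
    // selecting exactly the five tail bytes.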
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(k3, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general-purpose register, so we move the
    // data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements 0..63, which are later used
    // as compare targets against the tail count held in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1's, counting from
    // the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
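//
// A rough Java-level sketch of the is_array_equ case (illustrative only;
// the method name and signature below are assumptions, not actual library
// code):
//
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null) return false;
//     if (a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }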
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying a 64-byte fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the
      // non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64); // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}