/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

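// Sets the AVX-512 opmask register k1 to (1 << src) - 1, i.e. enables the low
// 'src' vector lanes, and leaves the lane count (src) in dst.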
void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

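// Restores k1 to the default all-lanes-enabled mask (this relies on k0 reading
// as zero here, so its negation yields all ones).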
void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                         Register retry_on_abort_count_Reg,
                                         RTMLockingCounters* stack_rtm_counters,
                                         Metadata* method_data, bool profile_rtm,
                                         Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
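  // If the mark word was unlocked we are done: the critical section runs inside
  // the transaction and commits at the matching xend in fast_unlock.  Otherwise
  // the lock is busy and we fall through to end or abort the transaction below.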

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                            Register scrReg, Register retry_on_busy_count_Reg,
                                            Register retry_on_abort_count_Reg,
                                            RTMLockingCounters* rtm_counters,
                                            Metadata* method_data, bool profile_rtm,
                                            Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
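  // A NULL owner inside the transaction means the monitor appears free, so we
  // jump out with the lock elided; a non-null owner falls through to the
  // lock-busy handling below.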
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                 Register scrReg, Register cx1Reg, Register cx2Reg,
                                 BiasedLockingCounters* counters,
                                 RTMLockingCounters* rtm_counters,
                                 RTMLockingCounters* stack_rtm_counters,
                                 Metadata* method_data,
                                 bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);           // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
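  // The mask keeps the high-order bits plus the low lock bits, so tmpReg becomes
  // zero only when the mark points into the current thread's stack within one
  // page above rsp.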
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into  m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath ;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, src);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, src);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, src);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    psraw(dst, src);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    psllw(dst, src);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    psrlw(dst, src);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    vpsraw(dst, nds, src, vector_len);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    vpsllw(dst, nds, src, vector_len);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    vpsrlw(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, src);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, src);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, src, vector_len);
  }
}

// Reductions for vectors of ints, longs, floats, and doubles.

void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;

    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI: paddd(dst, src); break;
    case Op_AddReductionVL: paddq(dst, src); break;

    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI: pmulld(dst, src); break;
    case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;

    case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;

    case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
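    // vtmp1[0] = src2[1]; combining with src2 below reduces the two lanes into lane 0.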
    reduce_operation_128(opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64
1131 
reduce2F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1132 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1133   reduce_operation_128(opcode, dst, src);
1134   pshufd(vtmp, src, 0x1);
1135   reduce_operation_128(opcode, dst, vtmp);
1136 }
1137 
reduce4F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1138 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1139   reduce2F(opcode, dst, src, vtmp);
1140   pshufd(vtmp, src, 0x2);
1141   reduce_operation_128(opcode, dst, vtmp);
1142   pshufd(vtmp, src, 0x3);
1143   reduce_operation_128(opcode, dst, vtmp);
1144 }
1145 
reduce8F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1146 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1147   reduce4F(opcode, dst, src, vtmp2);
1148   vextractf128_high(vtmp2, src);
1149   reduce4F(opcode, dst, vtmp2, vtmp1);
1150 }
1151 
reduce16F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1152 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1153   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1154   vextracti64x4_high(vtmp1, src);
1155   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1156 }
1157 
reduce2D(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1158 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1159   reduce_operation_128(opcode, dst, src);
1160   pshufd(vtmp, src, 0xE);
1161   reduce_operation_128(opcode, dst, vtmp);
1162 }
1163 
reduce4D(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1164 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1165   reduce2D(opcode, dst, src, vtmp2);
1166   vextractf128_high(vtmp2, src);
1167   reduce2D(opcode, dst, vtmp2, vtmp1);
1168 }
1169 
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1171   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1172   vextracti64x4_high(vtmp1, src);
1173   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1174 }
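
// Note: the float/double reductions above combine lanes strictly in ascending lane order,
// starting from the incoming accumulator in dst, i.e. roughly (sketch only):
//   double r = dst;
//   for (int i = 0; i < n; i++) r = op(r, src[i]);
//   dst = r;
// keeping the combination order deterministic, which matters for FP add/mul.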
1175 
1176 //-------------------------------------------------------------------------------------------
1177 
1178 // IndexOf for constant substrings with size >= 8 chars
1179 // which don't need to be loaded through stack.
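// Conceptually this emits code equivalent to the following Java-level sketch
// (illustrative only; the caller generates the length/zero checks noted below):
//   for (int i = 0; i + substr.length <= string.length; i++) {
//     if (string.regionMatches(i, substr, 0, substr.length)) return i;
//   }
//   return -1;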
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
1185   ShortBranchVerifier sbv(this);
1186   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1187   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1188 
1189   // This method uses the pcmpestri instruction with bound registers
1190   //   inputs:
1191   //     xmm - substring
1192   //     rax - substring length (elements count)
1193   //     mem - scanned string
1194   //     rdx - string length (elements count)
1195   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1196   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1197   //   outputs:
1198   //     rcx - matched index in string
1199   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1200   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1201   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1202   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1203   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
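  // Note on the pcmpestri imm8 values used here (per the Intel SDM encoding):
  //   bits 1:0 = source format   (00 = unsigned bytes, 01 = unsigned words)
  //   bits 3:2 = aggregation     (11 = "equal ordered", i.e. substring search)
  //   bits 5:4 = polarity        (00 = positive)
  //   bit  6   = index selection (0 = least significant matching index in ECX)
  // so 0x0c requests a byte substring search and 0x0d a word substring search.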
1204 
1205   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
1206         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
1207         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
1208 
1209   // Note, inline_string_indexOf() generates checks:
1210   // if (substr.count > string.count) return -1;
1211   // if (substr.count == 0) return 0;
1212   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
1213 
1214   // Load substring.
1215   if (ae == StrIntrinsicNode::UL) {
1216     pmovzxbw(vec, Address(str2, 0));
1217   } else {
1218     movdqu(vec, Address(str2, 0));
1219   }
1220   movl(cnt2, int_cnt2);
1221   movptr(result, str1); // string addr
1222 
1223   if (int_cnt2 > stride) {
1224     jmpb(SCAN_TO_SUBSTR);
1225 
    // Reload substr for rescan; this code is executed
    // only for large substrings (> 8 chars).
1228     bind(RELOAD_SUBSTR);
1229     if (ae == StrIntrinsicNode::UL) {
1230       pmovzxbw(vec, Address(str2, 0));
1231     } else {
1232       movdqu(vec, Address(str2, 0));
1233     }
1234     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
1235 
1236     bind(RELOAD_STR);
1237     // We came here after the beginning of the substring was
1238     // matched but the rest of it was not so we need to search
1239     // again. Start from the next element after the previous match.
1240 
    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
1243     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
1244     subl(cnt1, cnt2);
1245     addl(cnt1, int_cnt2);
1246     movl(cnt2, int_cnt2); // Now restore cnt2
1247 
1248     decrementl(cnt1);     // Shift to next element
1249     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1251 
1252     addptr(result, (1<<scale1));
1253 
1254   } // (int_cnt2 > 8)
1255 
1256   // Scan string for start of substr in 16-byte vectors
1257   bind(SCAN_TO_SUBSTR);
1258   pcmpestri(vec, Address(result, 0), mode);
1259   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1260   subl(cnt1, stride);
1261   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1262   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1264   addptr(result, 16);
1265   jmpb(SCAN_TO_SUBSTR);
1266 
1267   // Found a potential substr
1268   bind(FOUND_CANDIDATE);
1269   // Matched whole vector if first element matched (tmp(rcx) == 0).
1270   if (int_cnt2 == stride) {
1271     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
1272   } else { // int_cnt2 > 8
1273     jccb(Assembler::overflow, FOUND_SUBSTR);
1274   }
1275   // After pcmpestri tmp(rcx) contains matched element index
1276   // Compute start addr of substr
1277   lea(result, Address(result, tmp, scale1));
1278 
1279   // Make sure string is still long enough
1280   subl(cnt1, tmp);
1281   cmpl(cnt1, cnt2);
1282   if (int_cnt2 == stride) {
1283     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1284   } else { // int_cnt2 > 8
1285     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
1286   }
  // Left less than substring.
1288 
1289   bind(RET_NOT_FOUND);
1290   movl(result, -1);
1291   jmp(EXIT);
1292 
1293   if (int_cnt2 > stride) {
1294     // This code is optimized for the case when whole substring
1295     // is matched if its head is matched.
1296     bind(MATCH_SUBSTR_HEAD);
1297     pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
1299     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
1300 
1301     Label CONT_SCAN_SUBSTR;
1302     // Compare the rest of substring (> 8 chars).
1303     bind(FOUND_SUBSTR);
1304     // First 8 chars are already matched.
1305     negptr(cnt2);
1306     addptr(cnt2, stride);
1307 
1308     bind(SCAN_SUBSTR);
1309     subl(cnt1, stride);
1310     cmpl(cnt2, -stride); // Do not read beyond substring
1311     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
1312     // Back-up strings to avoid reading beyond substring:
1313     // cnt1 = cnt1 - cnt2 + 8
1314     addl(cnt1, cnt2); // cnt2 is negative
1315     addl(cnt1, stride);
1316     movl(cnt2, stride); negptr(cnt2);
1317     bind(CONT_SCAN_SUBSTR);
1318     if (int_cnt2 < (int)G) {
1319       int tail_off1 = int_cnt2<<scale1;
1320       int tail_off2 = int_cnt2<<scale2;
1321       if (ae == StrIntrinsicNode::UL) {
1322         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
1323       } else {
1324         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
1325       }
1326       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
1327     } else {
1328       // calculate index in register to avoid integer overflow (int_cnt2*2)
1329       movl(tmp, int_cnt2);
1330       addptr(tmp, cnt2);
1331       if (ae == StrIntrinsicNode::UL) {
1332         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
1333       } else {
1334         movdqu(vec, Address(str2, tmp, scale2, 0));
1335       }
1336       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
1337     }
    // Need to reload string pointers if the whole vector did not match
1339     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1340     addptr(cnt2, stride);
1341     jcc(Assembler::negative, SCAN_SUBSTR);
1342     // Fall through if found full substring
1343 
1344   } // (int_cnt2 > 8)
1345 
1346   bind(RET_FOUND);
1347   // Found result if we matched full small substring.
1348   // Compute substr offset
1349   subptr(result, str1);
1350   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1351     shrl(result, 1); // index
1352   }
1353   bind(EXIT);
1354 
1355 } // string_indexofC8
1356 
1357 // Small strings are loaded through stack if they cross page boundary.
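// Rationale (sketch): a 16-byte movdqu load of a short string may read past its end,
// which is only safe if the access cannot touch the next, possibly unmapped, page.
// The page check used below is effectively:
//   boolean fits = (addr & (page_size - 1)) <= page_size - 16;  // whole 16-byte read stays on one page
// When it does not fit, the string is first copied onto the stack where 16 bytes can be read safely.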
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
1363   ShortBranchVerifier sbv(this);
1364   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1365   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1366 
1367   //
1368   // int_cnt2 is length of small (< 8 chars) constant substring
1369   // or (-1) for non constant substring in which case its length
1370   // is in cnt2 register.
1371   //
1372   // Note, inline_string_indexOf() generates checks:
1373   // if (substr.count > string.count) return -1;
1374   // if (substr.count == 0) return 0;
1375   //
1376   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
1377   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
1378   // This method uses the pcmpestri instruction with bound registers
1379   //   inputs:
1380   //     xmm - substring
1381   //     rax - substring length (elements count)
1382   //     mem - scanned string
1383   //     rdx - string length (elements count)
1384   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
1385   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
1386   //   outputs:
1387   //     rcx - matched index in string
1388   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1389   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
1390   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
1391   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
1392 
1393   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
1394         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
1395         FOUND_CANDIDATE;
1396 
1397   { //========================================================
1398     // We don't know where these strings are located
1399     // and we can't read beyond them. Load them through stack.
1400     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
1401 
1402     movptr(tmp, rsp); // save old SP
1403 
1404     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
1405       if (int_cnt2 == (1>>scale2)) { // One byte
1406         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
1407         load_unsigned_byte(result, Address(str2, 0));
1408         movdl(vec, result); // move 32 bits
1409       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
1410         // Not enough header space in 32-bit VM: 12+3 = 15.
1411         movl(result, Address(str2, -1));
1412         shrl(result, 8);
1413         movdl(vec, result); // move 32 bits
1414       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
1415         load_unsigned_short(result, Address(str2, 0));
1416         movdl(vec, result); // move 32 bits
1417       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
1418         movdl(vec, Address(str2, 0)); // move 32 bits
1419       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
1420         movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
1422         // Array header size is 12 bytes in 32-bit VM
1423         // + 6 bytes for 3 chars == 18 bytes,
1424         // enough space to load vec and shift.
1425         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
1426         if (ae == StrIntrinsicNode::UL) {
1427           int tail_off = int_cnt2-8;
1428           pmovzxbw(vec, Address(str2, tail_off));
1429           psrldq(vec, -2*tail_off);
1430         }
1431         else {
1432           int tail_off = int_cnt2*(1<<scale2);
1433           movdqu(vec, Address(str2, tail_off-16));
1434           psrldq(vec, 16-tail_off);
1435         }
1436       }
1437     } else { // not constant substring
1438       cmpl(cnt2, stride);
1439       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
1440 
      // We can read beyond the string if str+16 does not cross a page boundary
1442       // since heaps are aligned and mapped by pages.
1443       assert(os::vm_page_size() < (int)G, "default page should be small");
1444       movl(result, str2); // We need only low 32 bits
1445       andl(result, (os::vm_page_size()-1));
1446       cmpl(result, (os::vm_page_size()-16));
1447       jccb(Assembler::belowEqual, CHECK_STR);
1448 
      // Move small strings to the stack to allow loading 16 bytes into vec.
1450       subptr(rsp, 16);
1451       int stk_offset = wordSize-(1<<scale2);
1452       push(cnt2);
1453 
1454       bind(COPY_SUBSTR);
1455       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
1456         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
1457         movb(Address(rsp, cnt2, scale2, stk_offset), result);
1458       } else if (ae == StrIntrinsicNode::UU) {
1459         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
1460         movw(Address(rsp, cnt2, scale2, stk_offset), result);
1461       }
1462       decrement(cnt2);
1463       jccb(Assembler::notZero, COPY_SUBSTR);
1464 
1465       pop(cnt2);
1466       movptr(str2, rsp);  // New substring address
1467     } // non constant
1468 
1469     bind(CHECK_STR);
1470     cmpl(cnt1, stride);
1471     jccb(Assembler::aboveEqual, BIG_STRINGS);
1472 
1473     // Check cross page boundary.
1474     movl(result, str1); // We need only low 32 bits
1475     andl(result, (os::vm_page_size()-1));
1476     cmpl(result, (os::vm_page_size()-16));
1477     jccb(Assembler::belowEqual, BIG_STRINGS);
1478 
1479     subptr(rsp, 16);
1480     int stk_offset = -(1<<scale1);
1481     if (int_cnt2 < 0) { // not constant
1482       push(cnt2);
1483       stk_offset += wordSize;
1484     }
1485     movl(cnt2, cnt1);
1486 
1487     bind(COPY_STR);
1488     if (ae == StrIntrinsicNode::LL) {
1489       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
1490       movb(Address(rsp, cnt2, scale1, stk_offset), result);
1491     } else {
1492       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
1493       movw(Address(rsp, cnt2, scale1, stk_offset), result);
1494     }
1495     decrement(cnt2);
1496     jccb(Assembler::notZero, COPY_STR);
1497 
1498     if (int_cnt2 < 0) { // not constant
1499       pop(cnt2);
1500     }
1501     movptr(str1, rsp);  // New string address
1502 
1503     bind(BIG_STRINGS);
1504     // Load substring.
1505     if (int_cnt2 < 0) { // -1
1506       if (ae == StrIntrinsicNode::UL) {
1507         pmovzxbw(vec, Address(str2, 0));
1508       } else {
1509         movdqu(vec, Address(str2, 0));
1510       }
1511       push(cnt2);       // substr count
1512       push(str2);       // substr addr
1513       push(str1);       // string addr
1514     } else {
1515       // Small (< 8 chars) constant substrings are loaded already.
1516       movl(cnt2, int_cnt2);
1517     }
1518     push(tmp);  // original SP
1519 
1520   } // Finished loading
1521 
1522   //========================================================
1523   // Start search
1524   //
1525 
1526   movptr(result, str1); // string addr
1527 
1528   if (int_cnt2  < 0) {  // Only for non constant substring
1529     jmpb(SCAN_TO_SUBSTR);
1530 
1531     // SP saved at sp+0
1532     // String saved at sp+1*wordSize
1533     // Substr saved at sp+2*wordSize
1534     // Substr count saved at sp+3*wordSize
1535 
    // Reload substr for rescan; this code is executed
    // only for large substrings (> 8 chars).
1538     bind(RELOAD_SUBSTR);
1539     movptr(str2, Address(rsp, 2*wordSize));
1540     movl(cnt2, Address(rsp, 3*wordSize));
1541     if (ae == StrIntrinsicNode::UL) {
1542       pmovzxbw(vec, Address(str2, 0));
1543     } else {
1544       movdqu(vec, Address(str2, 0));
1545     }
1546     // We came here after the beginning of the substring was
1547     // matched but the rest of it was not so we need to search
1548     // again. Start from the next element after the previous match.
1549     subptr(str1, result); // Restore counter
1550     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1551       shrl(str1, 1);
1552     }
1553     addl(cnt1, str1);
1554     decrementl(cnt1);   // Shift to next element
1555     cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1557 
1558     addptr(result, (1<<scale1));
1559   } // non constant
1560 
1561   // Scan string for start of substr in 16-byte vectors
1562   bind(SCAN_TO_SUBSTR);
1563   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
1564   pcmpestri(vec, Address(result, 0), mode);
1565   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
1566   subl(cnt1, stride);
1567   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
1568   cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
1570   addptr(result, 16);
1571 
1572   bind(ADJUST_STR);
1573   cmpl(cnt1, stride); // Do not read beyond string
1574   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
1575   // Back-up string to avoid reading beyond string.
1576   lea(result, Address(result, cnt1, scale1, -16));
1577   movl(cnt1, stride);
1578   jmpb(SCAN_TO_SUBSTR);
1579 
1580   // Found a potential substr
1581   bind(FOUND_CANDIDATE);
1582   // After pcmpestri tmp(rcx) contains matched element index
1583 
1584   // Make sure string is still long enough
1585   subl(cnt1, tmp);
1586   cmpl(cnt1, cnt2);
1587   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
1589 
1590   bind(RET_NOT_FOUND);
1591   movl(result, -1);
1592   jmp(CLEANUP);
1593 
1594   bind(FOUND_SUBSTR);
1595   // Compute start addr of substr
1596   lea(result, Address(result, tmp, scale1));
1597   if (int_cnt2 > 0) { // Constant substring
1598     // Repeat search for small substring (< 8 chars)
1599     // from new point without reloading substring.
1600     // Have to check that we don't read beyond string.
1601     cmpl(tmp, stride-int_cnt2);
1602     jccb(Assembler::greater, ADJUST_STR);
1603     // Fall through if matched whole substring.
1604   } else { // non constant
1605     assert(int_cnt2 == -1, "should be != 0");
1606 
1607     addl(tmp, cnt2);
1608     // Found result if we matched whole substring.
1609     cmpl(tmp, stride);
1610     jcc(Assembler::lessEqual, RET_FOUND);
1611 
1612     // Repeat search for small substring (<= 8 chars)
1613     // from new point 'str1' without reloading substring.
1614     cmpl(cnt2, stride);
1615     // Have to check that we don't read beyond string.
1616     jccb(Assembler::lessEqual, ADJUST_STR);
1617 
1618     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
1619     // Compare the rest of substring (> 8 chars).
1620     movptr(str1, result);
1621 
1622     cmpl(tmp, cnt2);
1623     // First 8 chars are already matched.
1624     jccb(Assembler::equal, CHECK_NEXT);
1625 
1626     bind(SCAN_SUBSTR);
1627     pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if the whole vector did not match
1629     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
1630 
1631     bind(CHECK_NEXT);
1632     subl(cnt2, stride);
1633     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
1634     addptr(str1, 16);
1635     if (ae == StrIntrinsicNode::UL) {
1636       addptr(str2, 8);
1637     } else {
1638       addptr(str2, 16);
1639     }
1640     subl(cnt1, stride);
1641     cmpl(cnt2, stride); // Do not read beyond substring
1642     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
1643     // Back-up strings to avoid reading beyond substring.
1644 
1645     if (ae == StrIntrinsicNode::UL) {
1646       lea(str2, Address(str2, cnt2, scale2, -8));
1647       lea(str1, Address(str1, cnt2, scale1, -16));
1648     } else {
1649       lea(str2, Address(str2, cnt2, scale2, -16));
1650       lea(str1, Address(str1, cnt2, scale1, -16));
1651     }
1652     subl(cnt1, cnt2);
1653     movl(cnt2, stride);
1654     addl(cnt1, stride);
1655     bind(CONT_SCAN_SUBSTR);
1656     if (ae == StrIntrinsicNode::UL) {
1657       pmovzxbw(vec, Address(str2, 0));
1658     } else {
1659       movdqu(vec, Address(str2, 0));
1660     }
1661     jmp(SCAN_SUBSTR);
1662 
1663     bind(RET_FOUND_LONG);
1664     movptr(str1, Address(rsp, wordSize));
1665   } // non constant
1666 
1667   bind(RET_FOUND);
1668   // Compute substr offset
1669   subptr(result, str1);
1670   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
1671     shrl(result, 1); // index
1672   }
1673   bind(CLEANUP);
1674   pop(rsp); // restore SP
1675 
1676 } // string_indexof
1677 
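// Illustrative Java-level equivalent of the char search below (sketch only):
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;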
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
1680   ShortBranchVerifier sbv(this);
1681   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
1682 
1683   int stride = 8;
1684 
1685   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
1686         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
1687         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
1688         FOUND_SEQ_CHAR, DONE_LABEL;
1689 
1690   movptr(result, str1);
1691   if (UseAVX >= 2) {
1692     cmpl(cnt1, stride);
1693     jcc(Assembler::less, SCAN_TO_CHAR);
1694     cmpl(cnt1, 2*stride);
1695     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
1696     movdl(vec1, ch);
1697     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
1698     vpxor(vec2, vec2);
1699     movl(tmp, cnt1);
1700     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
1701     andl(cnt1,0x0000000F);  //tail count (in chars)
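    // e.g. cnt1 == 37: tmp == 32 chars are handled by the 16-char loop below, cnt1 == 5 remain for the tail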
1702 
1703     bind(SCAN_TO_16_CHAR_LOOP);
1704     vmovdqu(vec3, Address(result, 0));
1705     vpcmpeqw(vec3, vec3, vec1, 1);
1706     vptest(vec2, vec3);
1707     jcc(Assembler::carryClear, FOUND_CHAR);
1708     addptr(result, 32);
1709     subl(tmp, 2*stride);
1710     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
1711     jmp(SCAN_TO_8_CHAR);
1712     bind(SCAN_TO_8_CHAR_INIT);
1713     movdl(vec1, ch);
1714     pshuflw(vec1, vec1, 0x00);
1715     pshufd(vec1, vec1, 0);
1716     pxor(vec2, vec2);
1717   }
1718   bind(SCAN_TO_8_CHAR);
1719   cmpl(cnt1, stride);
1720   jcc(Assembler::less, SCAN_TO_CHAR);
1721   if (UseAVX < 2) {
1722     movdl(vec1, ch);
1723     pshuflw(vec1, vec1, 0x00);
1724     pshufd(vec1, vec1, 0);
1725     pxor(vec2, vec2);
1726   }
1727   movl(tmp, cnt1);
1728   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
1729   andl(cnt1,0x00000007);  //tail count (in chars)
1730 
1731   bind(SCAN_TO_8_CHAR_LOOP);
1732   movdqu(vec3, Address(result, 0));
1733   pcmpeqw(vec3, vec1);
1734   ptest(vec2, vec3);
1735   jcc(Assembler::carryClear, FOUND_CHAR);
1736   addptr(result, 16);
1737   subl(tmp, stride);
1738   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
1739   bind(SCAN_TO_CHAR);
1740   testl(cnt1, cnt1);
1741   jcc(Assembler::zero, RET_NOT_FOUND);
1742   bind(SCAN_TO_CHAR_LOOP);
1743   load_unsigned_short(tmp, Address(result, 0));
1744   cmpl(ch, tmp);
1745   jccb(Assembler::equal, FOUND_SEQ_CHAR);
1746   addptr(result, 2);
1747   subl(cnt1, 1);
1748   jccb(Assembler::zero, RET_NOT_FOUND);
1749   jmp(SCAN_TO_CHAR_LOOP);
1750 
1751   bind(RET_NOT_FOUND);
1752   movl(result, -1);
1753   jmpb(DONE_LABEL);
1754 
1755   bind(FOUND_CHAR);
1756   if (UseAVX >= 2) {
1757     vpmovmskb(tmp, vec3);
1758   } else {
1759     pmovmskb(tmp, vec3);
1760   }
1761   bsfl(ch, tmp);
1762   addptr(result, ch);
1763 
1764   bind(FOUND_SEQ_CHAR);
1765   subptr(result, str1);
1766   shrl(result, 1);
1767 
1768   bind(DONE_LABEL);
1769 } // string_indexof_char
1770 
1771 // helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
1775   if (ae == StrIntrinsicNode::LL) {
1776     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
1777     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
1778   } else if (ae == StrIntrinsicNode::UU) {
1779     load_unsigned_short(elem1, Address(str1, index, scale, 0));
1780     load_unsigned_short(elem2, Address(str2, index, scale, 0));
1781   } else {
1782     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
1783     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
1784   }
1785 }
1786 
1787 // Compare strings, used for char[] and byte[].
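// Illustrative Java-level sketch of the comparison below (lengths are element counts;
// the different encodings are handled by the scale/stride setup inside):
//   int lim = Math.min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;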
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
1791   ShortBranchVerifier sbv(this);
1792   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
1793   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
1794   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
1795   int stride2x2 = 0x40;
1796   Address::ScaleFactor scale = Address::no_scale;
1797   Address::ScaleFactor scale1 = Address::no_scale;
1798   Address::ScaleFactor scale2 = Address::no_scale;
1799 
1800   if (ae != StrIntrinsicNode::LL) {
1801     stride2x2 = 0x20;
1802   }
1803 
1804   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
1805     shrl(cnt2, 1);
1806   }
1807   // Compute the minimum of the string lengths and the
1808   // difference of the string lengths (stack).
1809   // Do the conditional move stuff
1810   movl(result, cnt1);
1811   subl(cnt1, cnt2);
1812   push(cnt1);
1813   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
1814 
1815   // Is the minimum length zero?
1816   testl(cnt2, cnt2);
1817   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1818   if (ae == StrIntrinsicNode::LL) {
1819     // Load first bytes
1820     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
1821     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
1822   } else if (ae == StrIntrinsicNode::UU) {
1823     // Load first characters
1824     load_unsigned_short(result, Address(str1, 0));
1825     load_unsigned_short(cnt1, Address(str2, 0));
1826   } else {
1827     load_unsigned_byte(result, Address(str1, 0));
1828     load_unsigned_short(cnt1, Address(str2, 0));
1829   }
1830   subl(result, cnt1);
1831   jcc(Assembler::notZero,  POP_LABEL);
1832 
1833   if (ae == StrIntrinsicNode::UU) {
1834     // Divide length by 2 to get number of chars
1835     shrl(cnt2, 1);
1836   }
1837   cmpl(cnt2, 1);
1838   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1839 
1840   // Check if the strings start at the same location and setup scale and stride
1841   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1842     cmpptr(str1, str2);
1843     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
1844     if (ae == StrIntrinsicNode::LL) {
1845       scale = Address::times_1;
1846       stride = 16;
1847     } else {
1848       scale = Address::times_2;
1849       stride = 8;
1850     }
1851   } else {
1852     scale1 = Address::times_1;
1853     scale2 = Address::times_2;
1854     // scale not used
1855     stride = 8;
1856   }
1857 
1858   if (UseAVX >= 2 && UseSSE42Intrinsics) {
1859     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
1860     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
1861     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
1862     Label COMPARE_TAIL_LONG;
1863     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
1864 
1865     int pcmpmask = 0x19;
1866     if (ae == StrIntrinsicNode::LL) {
1867       pcmpmask &= ~0x01;
1868     }
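    // Note: 0x19 selects, per the Intel SDM imm8 encoding, an "equal each" (element-wise)
    // compare on unsigned words with negated result, so rcx receives the index of the first
    // mismatching element; clearing bit 0 switches the element type to unsigned bytes.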
1869 
1870     // Setup to compare 16-chars (32-bytes) vectors,
1871     // start from first character again because it has aligned address.
1872     if (ae == StrIntrinsicNode::LL) {
1873       stride2 = 32;
1874     } else {
1875       stride2 = 16;
1876     }
1877     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1878       adr_stride = stride << scale;
1879     } else {
1880       adr_stride1 = 8;  //stride << scale1;
1881       adr_stride2 = 16; //stride << scale2;
1882     }
1883 
1884     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
1885     // rax and rdx are used by pcmpestri as elements counters
1886     movl(result, cnt2);
1887     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
1888     jcc(Assembler::zero, COMPARE_TAIL_LONG);
1889 
    // Fast path: compare the first two 8-char vectors.
1891     bind(COMPARE_16_CHARS);
1892     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1893       movdqu(vec1, Address(str1, 0));
1894     } else {
1895       pmovzxbw(vec1, Address(str1, 0));
1896     }
1897     pcmpestri(vec1, Address(str2, 0), pcmpmask);
1898     jccb(Assembler::below, COMPARE_INDEX_CHAR);
1899 
1900     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1901       movdqu(vec1, Address(str1, adr_stride));
1902       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
1903     } else {
1904       pmovzxbw(vec1, Address(str1, adr_stride1));
1905       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
1906     }
1907     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
1908     addl(cnt1, stride);
1909 
1910     // Compare the characters at index in cnt1
1911     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
1912     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
1913     subl(result, cnt2);
1914     jmp(POP_LABEL);
1915 
1916     // Setup the registers to start vector comparison loop
1917     bind(COMPARE_WIDE_VECTORS);
1918     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1919       lea(str1, Address(str1, result, scale));
1920       lea(str2, Address(str2, result, scale));
1921     } else {
1922       lea(str1, Address(str1, result, scale1));
1923       lea(str2, Address(str2, result, scale2));
1924     }
1925     subl(result, stride2);
1926     subl(cnt2, stride2);
1927     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
1928     negptr(result);
1929 
1930     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
1931     bind(COMPARE_WIDE_VECTORS_LOOP);
1932 
1933 #ifdef _LP64
1934     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
1935       cmpl(cnt2, stride2x2);
1936       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
1937       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
1938       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
1939 
1940       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
1941       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1942         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
1943         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
1944       } else {
1945         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
1946         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
1947       }
1948       kortestql(k7, k7);
1949       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
1950       addptr(result, stride2x2);  // update since we already compared at this addr
1951       subl(cnt2, stride2x2);      // and sub the size too
1952       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
1953 
1954       vpxor(vec1, vec1);
1955       jmpb(COMPARE_WIDE_TAIL);
1956     }//if (VM_Version::supports_avx512vlbw())
1957 #endif // _LP64
1958 
1959 
1960     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1961     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1962       vmovdqu(vec1, Address(str1, result, scale));
1963       vpxor(vec1, Address(str2, result, scale));
1964     } else {
1965       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
1966       vpxor(vec1, Address(str2, result, scale2));
1967     }
1968     vptest(vec1, vec1);
1969     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
1970     addptr(result, stride2);
1971     subl(cnt2, stride2);
1972     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
1973     // clean upper bits of YMM registers
1974     vpxor(vec1, vec1);
1975 
1976     // compare wide vectors tail
1977     bind(COMPARE_WIDE_TAIL);
1978     testptr(result, result);
1979     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
1980 
1981     movl(result, stride2);
1982     movl(cnt2, result);
1983     negptr(result);
1984     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
1985 
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
1987     bind(VECTOR_NOT_EQUAL);
1988     // clean upper bits of YMM registers
1989     vpxor(vec1, vec1);
1990     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
1991       lea(str1, Address(str1, result, scale));
1992       lea(str2, Address(str2, result, scale));
1993     } else {
1994       lea(str1, Address(str1, result, scale1));
1995       lea(str2, Address(str2, result, scale2));
1996     }
1997     jmp(COMPARE_16_CHARS);
1998 
    // Compare tail chars, length between 1 and 15 chars
2000     bind(COMPARE_TAIL_LONG);
2001     movl(cnt2, result);
2002     cmpl(cnt2, stride);
2003     jcc(Assembler::less, COMPARE_SMALL_STR);
2004 
2005     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2006       movdqu(vec1, Address(str1, 0));
2007     } else {
2008       pmovzxbw(vec1, Address(str1, 0));
2009     }
2010     pcmpestri(vec1, Address(str2, 0), pcmpmask);
2011     jcc(Assembler::below, COMPARE_INDEX_CHAR);
2012     subptr(cnt2, stride);
2013     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2014     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2015       lea(str1, Address(str1, result, scale));
2016       lea(str2, Address(str2, result, scale));
2017     } else {
2018       lea(str1, Address(str1, result, scale1));
2019       lea(str2, Address(str2, result, scale2));
2020     }
2021     negptr(cnt2);
2022     jmpb(WHILE_HEAD_LABEL);
2023 
2024     bind(COMPARE_SMALL_STR);
2025   } else if (UseSSE42Intrinsics) {
2026     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
2027     int pcmpmask = 0x19;
2028     // Setup to compare 8-char (16-byte) vectors,
2029     // start from first character again because it has aligned address.
2030     movl(result, cnt2);
2031     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
2032     if (ae == StrIntrinsicNode::LL) {
2033       pcmpmask &= ~0x01;
2034     }
2035     jcc(Assembler::zero, COMPARE_TAIL);
2036     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2037       lea(str1, Address(str1, result, scale));
2038       lea(str2, Address(str2, result, scale));
2039     } else {
2040       lea(str1, Address(str1, result, scale1));
2041       lea(str2, Address(str2, result, scale2));
2042     }
2043     negptr(result);
2044 
2045     // pcmpestri
2046     //   inputs:
2047     //     vec1- substring
2048     //     rax - negative string length (elements count)
2049     //     mem - scanned string
2050     //     rdx - string length (elements count)
2051     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
2052     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
2053     //   outputs:
2054     //     rcx - first mismatched element index
2055     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
2056 
2057     bind(COMPARE_WIDE_VECTORS);
2058     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2059       movdqu(vec1, Address(str1, result, scale));
2060       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2061     } else {
2062       pmovzxbw(vec1, Address(str1, result, scale1));
2063       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2064     }
2065     // After pcmpestri cnt1(rcx) contains mismatched element index
2066 
2067     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
2068     addptr(result, stride);
2069     subptr(cnt2, stride);
2070     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
2071 
2072     // compare wide vectors tail
2073     testptr(result, result);
2074     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2075 
2076     movl(cnt2, stride);
2077     movl(result, stride);
2078     negptr(result);
2079     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2080       movdqu(vec1, Address(str1, result, scale));
2081       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
2082     } else {
2083       pmovzxbw(vec1, Address(str1, result, scale1));
2084       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
2085     }
2086     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
2087 
2088     // Mismatched characters in the vectors
2089     bind(VECTOR_NOT_EQUAL);
2090     addptr(cnt1, result);
2091     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
2092     subl(result, cnt2);
2093     jmpb(POP_LABEL);
2094 
2095     bind(COMPARE_TAIL); // limit is zero
2096     movl(cnt2, result);
2097     // Fallthru to tail compare
2098   }
2099   // Shift str2 and str1 to the end of the arrays, negate min
2100   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2101     lea(str1, Address(str1, cnt2, scale));
2102     lea(str2, Address(str2, cnt2, scale));
2103   } else {
2104     lea(str1, Address(str1, cnt2, scale1));
2105     lea(str2, Address(str2, cnt2, scale2));
2106   }
2107   decrementl(cnt2);  // first character was compared already
2108   negptr(cnt2);
2109 
2110   // Compare the rest of the elements
2111   bind(WHILE_HEAD_LABEL);
2112   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
2113   subl(result, cnt1);
2114   jccb(Assembler::notZero, POP_LABEL);
2115   increment(cnt2);
2116   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
2117 
2118   // Strings are equal up to min length.  Return the length difference.
2119   bind(LENGTH_DIFF_LABEL);
2120   pop(result);
2121   if (ae == StrIntrinsicNode::UU) {
2122     // Divide diff by 2 to get number of chars
2123     sarl(result, 1);
2124   }
2125   jmpb(DONE_LABEL);
2126 
2127 #ifdef _LP64
2128   if (VM_Version::supports_avx512vlbw()) {
2129 
2130     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
2131 
2132     kmovql(cnt1, k7);
2133     notq(cnt1);
2134     bsfq(cnt2, cnt1);
2135     if (ae != StrIntrinsicNode::LL) {
2136       // Divide diff by 2 to get number of chars
2137       sarl(cnt2, 1);
2138     }
2139     addq(result, cnt2);
2140     if (ae == StrIntrinsicNode::LL) {
2141       load_unsigned_byte(cnt1, Address(str2, result));
2142       load_unsigned_byte(result, Address(str1, result));
2143     } else if (ae == StrIntrinsicNode::UU) {
2144       load_unsigned_short(cnt1, Address(str2, result, scale));
2145       load_unsigned_short(result, Address(str1, result, scale));
2146     } else {
2147       load_unsigned_short(cnt1, Address(str2, result, scale2));
2148       load_unsigned_byte(result, Address(str1, result, scale1));
2149     }
2150     subl(result, cnt1);
2151     jmpb(POP_LABEL);
2152   }//if (VM_Version::supports_avx512vlbw())
2153 #endif // _LP64
2154 
2155   // Discard the stored length difference
2156   bind(POP_LABEL);
2157   pop(cnt1);
2158 
2159   // That's it
2160   bind(DONE_LABEL);
2161   if(ae == StrIntrinsicNode::UL) {
2162     negl(result);
2163   }
2164 
2165 }
2166 
2167 // Search for Non-ASCII character (Negative byte value) in a byte array,
2168 // return true if it has any and false otherwise.
2169 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
2170 //   @HotSpotIntrinsicCandidate
2171 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
2172 //     for (int i = off; i < off + len; i++) {
2173 //       if (ba[i] < 0) {
2174 //         return true;
2175 //       }
2176 //     }
2177 //     return false;
2178 //   }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2) {
2182   // rsi: byte array
2183   // rcx: len
2184   // rax: result
2185   ShortBranchVerifier sbv(this);
2186   assert_different_registers(ary1, len, result, tmp1);
2187   assert_different_registers(vec1, vec2);
2188   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
2189 
2190   // len == 0
2191   testl(len, len);
2192   jcc(Assembler::zero, FALSE_LABEL);
2193 
2194   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
2195     VM_Version::supports_avx512vlbw() &&
2196     VM_Version::supports_bmi2()) {
2197 
2198     Label test_64_loop, test_tail;
2199     Register tmp3_aliased = len;
2200 
2201     movl(tmp1, len);
2202     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
2203 
2204     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
2205     andl(len, ~(64 - 1));    // vector count (in chars)
2206     jccb(Assembler::zero, test_tail);
2207 
2208     lea(ary1, Address(ary1, len, Address::times_1));
2209     negptr(len);
2210 
2211     bind(test_64_loop);
2212     // Check whether our 64 elements of size byte contain negatives
2213     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
2214     kortestql(k2, k2);
2215     jcc(Assembler::notZero, TRUE_LABEL);
2216 
2217     addptr(len, 64);
2218     jccb(Assembler::notZero, test_64_loop);
2219 
2220 
2221     bind(test_tail);
2222     // bail out when there is nothing to be done
2223     testl(tmp1, -1);
2224     jcc(Assembler::zero, FALSE_LABEL);
2225 
2226     // ~(~0 << len) applied up to two times (for 32-bit scenario)
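    // e.g. tmp1 == 5: ~(~0 << 5) == 0b11111, a mask with the low five bits set.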
2227 #ifdef _LP64
2228     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
2229     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
2230     notq(tmp3_aliased);
2231     kmovql(k3, tmp3_aliased);
2232 #else
2233     Label k_init;
2234     jmp(k_init);
2235 
    // We cannot read 64 bits from a general-purpose register here (32-bit VM), thus we
    // move the data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements 0..63 which is later used as the compare
    // target, with the tail count contained in the tmp1 register.
    // The result is a k register holding tmp1 consecutive 1's, counting from the least
    // significant bit.
2242     address tmp = pc();
2243     emit_int64(0x0706050403020100);
2244     emit_int64(0x0F0E0D0C0B0A0908);
2245     emit_int64(0x1716151413121110);
2246     emit_int64(0x1F1E1D1C1B1A1918);
2247     emit_int64(0x2726252423222120);
2248     emit_int64(0x2F2E2D2C2B2A2928);
2249     emit_int64(0x3736353433323130);
2250     emit_int64(0x3F3E3D3C3B3A3938);
2251 
2252     bind(k_init);
2253     lea(len, InternalAddress(tmp));
2254     // create mask to test for negative byte inside a vector
2255     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
2256     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
2257 
2258 #endif
2259     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
2260     ktestq(k2, k3);
2261     jcc(Assembler::notZero, TRUE_LABEL);
2262 
2263     jmp(FALSE_LABEL);
2264   } else {
2265     movl(result, len); // copy
2266 
2267     if (UseAVX >= 2 && UseSSE >= 2) {
2268       // With AVX2, use 32-byte vector compare
2269       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2270 
2271       // Compare 32-byte vectors
2272       andl(result, 0x0000001f);  //   tail count (in bytes)
2273       andl(len, 0xffffffe0);   // vector count (in bytes)
2274       jccb(Assembler::zero, COMPARE_TAIL);
2275 
2276       lea(ary1, Address(ary1, len, Address::times_1));
2277       negptr(len);
2278 
2279       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
2280       movdl(vec2, tmp1);
2281       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
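      // A byte is negative iff its sign bit is set, so testing the loaded bytes against the
      // 0x80808080 mask (vptest below) is non-zero exactly when some byte value is negative.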
2282 
2283       bind(COMPARE_WIDE_VECTORS);
2284       vmovdqu(vec1, Address(ary1, len, Address::times_1));
2285       vptest(vec1, vec2);
2286       jccb(Assembler::notZero, TRUE_LABEL);
2287       addptr(len, 32);
2288       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2289 
2290       testl(result, result);
2291       jccb(Assembler::zero, FALSE_LABEL);
2292 
2293       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2294       vptest(vec1, vec2);
2295       jccb(Assembler::notZero, TRUE_LABEL);
2296       jmpb(FALSE_LABEL);
2297 
2298       bind(COMPARE_TAIL); // len is zero
2299       movl(len, result);
2300       // Fallthru to tail compare
2301     } else if (UseSSE42Intrinsics) {
2302       // With SSE4.2, use double quad vector compare
2303       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2304 
2305       // Compare 16-byte vectors
2306       andl(result, 0x0000000f);  //   tail count (in bytes)
2307       andl(len, 0xfffffff0);   // vector count (in bytes)
2308       jcc(Assembler::zero, COMPARE_TAIL);
2309 
2310       lea(ary1, Address(ary1, len, Address::times_1));
2311       negptr(len);
2312 
2313       movl(tmp1, 0x80808080);
2314       movdl(vec2, tmp1);
2315       pshufd(vec2, vec2, 0);
2316 
2317       bind(COMPARE_WIDE_VECTORS);
2318       movdqu(vec1, Address(ary1, len, Address::times_1));
2319       ptest(vec1, vec2);
2320       jcc(Assembler::notZero, TRUE_LABEL);
2321       addptr(len, 16);
2322       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2323 
2324       testl(result, result);
2325       jcc(Assembler::zero, FALSE_LABEL);
2326 
2327       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2328       ptest(vec1, vec2);
2329       jccb(Assembler::notZero, TRUE_LABEL);
2330       jmpb(FALSE_LABEL);
2331 
2332       bind(COMPARE_TAIL); // len is zero
2333       movl(len, result);
2334       // Fallthru to tail compare
2335     }
2336   }
2337   // Compare 4-byte vectors
2338   andl(len, 0xfffffffc); // vector count (in bytes)
2339   jccb(Assembler::zero, COMPARE_CHAR);
2340 
2341   lea(ary1, Address(ary1, len, Address::times_1));
2342   negptr(len);
2343 
2344   bind(COMPARE_VECTORS);
2345   movl(tmp1, Address(ary1, len, Address::times_1));
2346   andl(tmp1, 0x80808080);
2347   jccb(Assembler::notZero, TRUE_LABEL);
2348   addptr(len, 4);
2349   jcc(Assembler::notZero, COMPARE_VECTORS);
2350 
2351   // Compare trailing char (final 2 bytes), if any
2352   bind(COMPARE_CHAR);
2353   testl(result, 0x2);   // tail  char
2354   jccb(Assembler::zero, COMPARE_BYTE);
2355   load_unsigned_short(tmp1, Address(ary1, 0));
2356   andl(tmp1, 0x00008080);
2357   jccb(Assembler::notZero, TRUE_LABEL);
2358   subptr(result, 2);
2359   lea(ary1, Address(ary1, 2));
2360 
2361   bind(COMPARE_BYTE);
2362   testl(result, 0x1);   // tail  byte
2363   jccb(Assembler::zero, FALSE_LABEL);
2364   load_unsigned_byte(tmp1, Address(ary1, 0));
2365   andl(tmp1, 0x00000080);
2366   jccb(Assembler::notEqual, TRUE_LABEL);
2367   jmpb(FALSE_LABEL);
2368 
2369   bind(TRUE_LABEL);
2370   movl(result, 1);   // return true
2371   jmpb(DONE);
2372 
2373   bind(FALSE_LABEL);
2374   xorl(result, result); // return false
2375 
2376   // That's it
2377   bind(DONE);
2378   if (UseAVX >= 2 && UseSSE >= 2) {
2379     // clean upper bits of YMM registers
2380     vpxor(vec1, vec1);
2381     vpxor(vec2, vec2);
2382   }
2383 }
2384 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
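// Illustrative Java-level sketch (for the is_array_equ case; the substring variant skips
// the null/length checks and compares 'limit' elements starting at ary1/ary2):
//   if (a == b) return true;
//   if (a == null || b == null || a.length != b.length) return false;
//   for (int i = 0; i < a.length; i++) {
//     if (a[i] != b[i]) return false;
//   }
//   return true;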
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
2388   ShortBranchVerifier sbv(this);
2389   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
2390 
2391   int length_offset  = arrayOopDesc::length_offset_in_bytes();
2392   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
2393 
2394   if (is_array_equ) {
2395     // Check the input args
2396     cmpoop(ary1, ary2);
2397     jcc(Assembler::equal, TRUE_LABEL);
2398 
2399     // Need additional checks for arrays_equals.
2400     testptr(ary1, ary1);
2401     jcc(Assembler::zero, FALSE_LABEL);
2402     testptr(ary2, ary2);
2403     jcc(Assembler::zero, FALSE_LABEL);
2404 
2405     // Check the lengths
2406     movl(limit, Address(ary1, length_offset));
2407     cmpl(limit, Address(ary2, length_offset));
2408     jcc(Assembler::notEqual, FALSE_LABEL);
2409   }
2410 
2411   // count == 0
2412   testl(limit, limit);
2413   jcc(Assembler::zero, TRUE_LABEL);
2414 
2415   if (is_array_equ) {
2416     // Load array address
2417     lea(ary1, Address(ary1, base_offset));
2418     lea(ary2, Address(ary2, base_offset));
2419   }
2420 
2421   if (is_array_equ && is_char) {
2422     // arrays_equals when used for char[].
2423     shll(limit, 1);      // byte count != 0
2424   }
2425   movl(result, limit); // copy
2426 
2427   if (UseAVX >= 2) {
2428     // With AVX2, use 32-byte vector compare
2429     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2430 
2431     // Compare 32-byte vectors
2432     andl(result, 0x0000001f);  //   tail count (in bytes)
2433     andl(limit, 0xffffffe0);   // vector count (in bytes)
2434     jcc(Assembler::zero, COMPARE_TAIL);
2435 
2436     lea(ary1, Address(ary1, limit, Address::times_1));
2437     lea(ary2, Address(ary2, limit, Address::times_1));
2438     negptr(limit);
2439 
2440 #ifdef _LP64
2441     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
2442       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
2443 
2444       cmpl(limit, -64);
2445       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
2446 
2447       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
2448 
2449       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
2450       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
2451       kortestql(k7, k7);
2452       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2453       addptr(limit, 64);  // update since we already compared at this addr
2454       cmpl(limit, -64);
2455       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
2456 
2457       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
2459       //  cmpl(limit, 0);
2460       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
2461       // But since we stopped at the points ary{1,2}+limit which are
2462       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
2463       // (|limit| <= 32 and result < 32),
2464       // we may just compare the last 64 bytes.
2465       //
      addptr(result, -64);   // it is safe because we just came from this area
2467       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
2468       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
2469       kortestql(k7, k7);
2470       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
2471 
2472       jmp(TRUE_LABEL);
2473 
2474       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
2475 
2476     }//if (VM_Version::supports_avx512vlbw())
2477 #endif //_LP64
2478     bind(COMPARE_WIDE_VECTORS);
2479     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
2480     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
2481     vpxor(vec1, vec2);
2482 
2483     vptest(vec1, vec1);
2484     jcc(Assembler::notZero, FALSE_LABEL);
2485     addptr(limit, 32);
2486     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2487 
2488     testl(result, result);
2489     jcc(Assembler::zero, TRUE_LABEL);
2490 
2491     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
2492     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
2493     vpxor(vec1, vec2);
2494 
2495     vptest(vec1, vec1);
2496     jccb(Assembler::notZero, FALSE_LABEL);
2497     jmpb(TRUE_LABEL);
2498 
2499     bind(COMPARE_TAIL); // limit is zero
2500     movl(limit, result);
2501     // Fallthru to tail compare
2502   } else if (UseSSE42Intrinsics) {
2503     // With SSE4.2, use double quad vector compare
2504     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
2505 
2506     // Compare 16-byte vectors
2507     andl(result, 0x0000000f);  //   tail count (in bytes)
2508     andl(limit, 0xfffffff0);   // vector count (in bytes)
2509     jcc(Assembler::zero, COMPARE_TAIL);
2510 
2511     lea(ary1, Address(ary1, limit, Address::times_1));
2512     lea(ary2, Address(ary2, limit, Address::times_1));
2513     negptr(limit);
2514 
2515     bind(COMPARE_WIDE_VECTORS);
2516     movdqu(vec1, Address(ary1, limit, Address::times_1));
2517     movdqu(vec2, Address(ary2, limit, Address::times_1));
2518     pxor(vec1, vec2);
2519 
2520     ptest(vec1, vec1);
2521     jcc(Assembler::notZero, FALSE_LABEL);
2522     addptr(limit, 16);
2523     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
2524 
2525     testl(result, result);
2526     jcc(Assembler::zero, TRUE_LABEL);
2527 
2528     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
2529     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
2530     pxor(vec1, vec2);
2531 
2532     ptest(vec1, vec1);
2533     jccb(Assembler::notZero, FALSE_LABEL);
2534     jmpb(TRUE_LABEL);
2535 
2536     bind(COMPARE_TAIL); // limit is zero
2537     movl(limit, result);
2538     // Fallthru to tail compare
2539   }
2540 
2541   // Compare 4-byte vectors
2542   andl(limit, 0xfffffffc); // vector count (in bytes)
2543   jccb(Assembler::zero, COMPARE_CHAR);
2544 
2545   lea(ary1, Address(ary1, limit, Address::times_1));
2546   lea(ary2, Address(ary2, limit, Address::times_1));
2547   negptr(limit);
2548 
2549   bind(COMPARE_VECTORS);
2550   movl(chr, Address(ary1, limit, Address::times_1));
2551   cmpl(chr, Address(ary2, limit, Address::times_1));
2552   jccb(Assembler::notEqual, FALSE_LABEL);
2553   addptr(limit, 4);
2554   jcc(Assembler::notZero, COMPARE_VECTORS);
2555 
2556   // Compare trailing char (final 2 bytes), if any
2557   bind(COMPARE_CHAR);
2558   testl(result, 0x2);   // tail  char
2559   jccb(Assembler::zero, COMPARE_BYTE);
2560   load_unsigned_short(chr, Address(ary1, 0));
2561   load_unsigned_short(limit, Address(ary2, 0));
2562   cmpl(chr, limit);
2563   jccb(Assembler::notEqual, FALSE_LABEL);
2564 
2565   if (is_array_equ && is_char) {
2566     bind(COMPARE_BYTE);
2567   } else {
2568     lea(ary1, Address(ary1, 2));
2569     lea(ary2, Address(ary2, 2));
2570 
2571     bind(COMPARE_BYTE);
2572     testl(result, 0x1);   // tail  byte
2573     jccb(Assembler::zero, TRUE_LABEL);
2574     load_unsigned_byte(chr, Address(ary1, 0));
2575     load_unsigned_byte(limit, Address(ary2, 0));
2576     cmpl(chr, limit);
2577     jccb(Assembler::notEqual, FALSE_LABEL);
2578   }
2579   bind(TRUE_LABEL);
2580   movl(result, 1);   // return true
2581   jmpb(DONE);
2582 
2583   bind(FALSE_LABEL);
2584   xorl(result, result); // return false
2585 
2586   // That's it
2587   bind(DONE);
2588   if (UseAVX >= 2) {
2589     // clean upper bits of YMM registers
2590     vpxor(vec1, vec1);
2591     vpxor(vec2, vec2);
2592   }
2593 }
2594