1 /*
2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  *
23  */
24 
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "oops/methodData.hpp"
29 #include "opto/c2_MacroAssembler.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/opcodes.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/biasedLocking.hpp"
34 #include "runtime/objectMonitor.hpp"
35 #include "runtime/stubRoutines.hpp"
36 
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
38   switch (vlen_in_bytes) {
39     case  4: // fall-through
40     case  8: // fall-through
41     case 16: return Assembler::AVX_128bit;
42     case 32: return Assembler::AVX_256bit;
43     case 64: return Assembler::AVX_512bit;
44 
45     default: {
46       ShouldNotReachHere();
47       return Assembler::AVX_NoVec;
48     }
49   }
50 }
51 
void C2_MacroAssembler::setvectmask(Register dst, Register src) {
53   guarantee(PostLoopMultiversioning, "must be");
54   Assembler::movl(dst, 1);
55   Assembler::shlxl(dst, dst, src);
56   Assembler::decl(dst);
57   Assembler::kmovdl(k1, dst);
58   Assembler::movl(dst, src);
59 }
60 
void C2_MacroAssembler::restorevectmask() {
62   guarantee(PostLoopMultiversioning, "must be");
63   Assembler::knotwl(k1, k0);
64 }
65 
66 #if INCLUDE_RTM_OPT
67 
68 // Update rtm_counters based on abort status
69 // input: abort_status
70 //        rtm_counters (RTMLockingCounters*)
71 // flags are killed
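//
// For orientation, a hedged sketch of the xbegin abort-status bits that the
// per-bit loop below tallies (the authoritative list lives in rtmLocking.hpp
// and the Intel SDM; treat this as an illustrative summary only):
//   bit 0 - abort caused by an explicit XABORT instruction
//   bit 1 - the transaction may succeed on a retry
//   bit 2 - another logical processor conflicted with a transactional address
//   bit 3 - an internal buffer overflowed
//   bit 4 - a debug breakpoint was hit
//   bit 5 - the abort occurred inside a nested transaction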
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
73 
74   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
75   if (PrintPreciseRTMLockingStatistics) {
76     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
77       Label check_abort;
78       testl(abort_status, (1<<i));
79       jccb(Assembler::equal, check_abort);
80       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
81       bind(check_abort);
82     }
83   }
84 }
85 
86 // Branch if (random & (count-1) != 0), count is 2^n
87 // tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
89   assert(tmp == rax, "");
90   assert(scr == rdx, "");
91   rdtsc(); // modifies EDX:EAX
92   andptr(tmp, count-1);
93   jccb(Assembler::notZero, brLabel);
94 }
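// Illustrative arithmetic for the sampling branch above (the value 64 is just
// an example, not necessarily the flag default): with count = 64 the test is
// (rdtsc_low & 63) != 0, which holds for 63 out of every 64 timestamp values,
// so on average only 1 in 64 callers falls through to bump the RTM total
// counter. Requiring count to be a power of two is what lets the cheap mask
// stand in for a modulo.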
95 
96 // Perform abort ratio calculation, set no_rtm bit if high ratio
97 // input:  rtm_counters_Reg (RTMLockingCounters* address)
98 // tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
103   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
104 
105   if (RTMLockingCalculationDelay > 0) {
106     // Delay calculation
107     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
108     testptr(tmpReg, tmpReg);
109     jccb(Assembler::equal, L_done);
110   }
111   // Abort ratio calculation only if abort_count > RTMAbortThreshold
112   //   Aborted transactions = abort_count * 100
113   //   All transactions = total_count *  RTMTotalCountIncrRate
114   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
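  // Worked example of the calculation above (numbers chosen for illustration,
  // not the flag defaults): with RTMTotalCountIncrRate = 64 and
  // RTMAbortRatio = 50, an abort_count of 8000 and a sampled total_count of
  // 200 give
  //   Aborted transactions = 8000 * 100 = 800000
  //   All transactions     = 200 * 64   = 12800
  //   All * RTMAbortRatio  = 12800 * 50 = 640000
  // Since 800000 >= 640000 (an estimated abort ratio of 62.5% >= 50%), the
  // no_rtm bit would be set.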
115 
116   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
117   cmpptr(tmpReg, RTMAbortThreshold);
118   jccb(Assembler::below, L_check_always_rtm2);
119   imulptr(tmpReg, tmpReg, 100);
120 
121   Register scrReg = rtm_counters_Reg;
122   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
123   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
124   imulptr(scrReg, scrReg, RTMAbortRatio);
125   cmpptr(tmpReg, scrReg);
126   jccb(Assembler::below, L_check_always_rtm1);
127   if (method_data != NULL) {
128     // set rtm_state to "no rtm" in MDO
129     mov_metadata(tmpReg, method_data);
130     lock();
131     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
132   }
133   jmpb(L_done);
134   bind(L_check_always_rtm1);
135   // Reload RTMLockingCounters* address
136   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
137   bind(L_check_always_rtm2);
138   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
139   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
140   jccb(Assembler::below, L_done);
141   if (method_data != NULL) {
142     // set rtm_state to "always rtm" in MDO
143     mov_metadata(tmpReg, method_data);
144     lock();
145     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
146   }
147   bind(L_done);
148 }
149 
150 // Update counters and perform abort ratio calculation
151 // input:  abort_status_Reg
152 // rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {
158 
159   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
160   // update rtm counters based on rax value at abort
161   // reads abort_status_Reg, updates flags
162   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
163   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
164   if (profile_rtm) {
165     // Save abort status because abort_status_Reg is used by following code.
166     if (RTMRetryCount > 0) {
167       push(abort_status_Reg);
168     }
169     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
170     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
171     // restore abort status
172     if (RTMRetryCount > 0) {
173       pop(abort_status_Reg);
174     }
175   }
176 }
177 
178 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
179 // inputs: retry_count_Reg
180 //       : abort_status_Reg
181 // output: retry_count_Reg decremented by 1
182 // flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
184   Label doneRetry;
185   assert(abort_status_Reg == rax, "");
186   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
187   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
188   // if reason is in 0x6 and retry count != 0 then retry
189   andptr(abort_status_Reg, 0x6);
190   jccb(Assembler::zero, doneRetry);
191   testl(retry_count_Reg, retry_count_Reg);
192   jccb(Assembler::zero, doneRetry);
193   pause();
194   decrementl(retry_count_Reg);
195   jmp(retryLabel);
196   bind(doneRetry);
197 }
198 
199 // Spin and retry if lock is busy,
200 // inputs: box_Reg (monitor address)
201 //       : retry_count_Reg
202 // output: retry_count_Reg decremented by 1
203 //       : clear z flag if retry count exceeded
204 // tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
207   Label SpinLoop, SpinExit, doneRetry;
208   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
209 
210   testl(retry_count_Reg, retry_count_Reg);
211   jccb(Assembler::zero, doneRetry);
212   decrementl(retry_count_Reg);
213   movptr(scr_Reg, RTMSpinLoopCount);
214 
215   bind(SpinLoop);
216   pause();
217   decrementl(scr_Reg);
218   jccb(Assembler::lessEqual, SpinExit);
219   movptr(tmp_Reg, Address(box_Reg, owner_offset));
220   testptr(tmp_Reg, tmp_Reg);
221   jccb(Assembler::notZero, SpinLoop);
222 
223   bind(SpinExit);
224   jmp(retryLabel);
225   bind(doneRetry);
226   incrementl(retry_count_Reg); // clear z flag
227 }
228 
229 // Use RTM for normal stack locks
230 // Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
236   assert(UseRTMForStackLocks, "why call this otherwise?");
237   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
238   assert(tmpReg == rax, "");
239   assert(scrReg == rdx, "");
240   Label L_rtm_retry, L_decrement_retry, L_on_abort;
241 
242   if (RTMRetryCount > 0) {
243     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
244     bind(L_rtm_retry);
245   }
246   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
247   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
248   jcc(Assembler::notZero, IsInflated);
249 
250   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
251     Label L_noincrement;
252     if (RTMTotalCountIncrRate > 1) {
253       // tmpReg, scrReg and flags are killed
254       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
255     }
256     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
257     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
258     bind(L_noincrement);
259   }
260   xbegin(L_on_abort);
261   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
262   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
263   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
264   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
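  // For orientation, a sketch of the low mark-word bit patterns being triaged
  // here (see markWord.hpp for the authoritative encoding; this is only a
  // summary):
  //   [ptr             | 00]  stack-locked (mark points at the on-stack BasicLock)
  //   [header          | 01]  unlocked / neutral
  //   [ObjectMonitor*  | 10]  inflated (monitor_value)
  //   [ptr             | 11]  marked, used only by the GC
  //   [JavaThread* ... 101]   biased toward a thread (biased_lock_pattern)
  // Only the unlocked (01) case jumps to DONE_LABEL with the lock elided
  // inside the transaction; any other pattern ends or aborts the transaction
  // below and is handled outside RTM.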
265 
266   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
267   if (UseRTMXendForLockBusy) {
268     xend();
269     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
270     jmp(L_decrement_retry);
271   }
272   else {
273     xabort(0);
274   }
275   bind(L_on_abort);
276   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
277     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
278   }
279   bind(L_decrement_retry);
280   if (RTMRetryCount > 0) {
281     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
282     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
283   }
284 }
285 
286 // Use RTM for inflating locks
287 // inputs: objReg (object to lock)
288 //         boxReg (on-stack box address (displaced header location) - KILLED)
289 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
296   assert(UseRTMLocking, "why call this otherwise?");
297   assert(tmpReg == rax, "");
298   assert(scrReg == rdx, "");
299   Label L_rtm_retry, L_decrement_retry, L_on_abort;
300   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
301 
302   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
303   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
304   movptr(boxReg, tmpReg); // Save ObjectMonitor address
305 
306   if (RTMRetryCount > 0) {
307     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
308     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
309     bind(L_rtm_retry);
310   }
311   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
312     Label L_noincrement;
313     if (RTMTotalCountIncrRate > 1) {
314       // tmpReg, scrReg and flags are killed
315       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
316     }
317     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
318     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
319     bind(L_noincrement);
320   }
321   xbegin(L_on_abort);
322   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
323   movptr(tmpReg, Address(tmpReg, owner_offset));
324   testptr(tmpReg, tmpReg);
325   jcc(Assembler::zero, DONE_LABEL);
326   if (UseRTMXendForLockBusy) {
327     xend();
328     jmp(L_decrement_retry);
329   }
330   else {
331     xabort(0);
332   }
333   bind(L_on_abort);
334   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
335   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
336     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
337   }
338   if (RTMRetryCount > 0) {
339     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
340     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
341   }
342 
343   movptr(tmpReg, Address(boxReg, owner_offset)) ;
344   testptr(tmpReg, tmpReg) ;
345   jccb(Assembler::notZero, L_decrement_retry) ;
346 
347   // Appears unlocked - try to swing _owner from null to non-null.
348   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
349 #ifdef _LP64
350   Register threadReg = r15_thread;
351 #else
352   get_thread(scrReg);
353   Register threadReg = scrReg;
354 #endif
355   lock();
356   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
357 
358   if (RTMRetryCount > 0) {
359     // success done else retry
360     jccb(Assembler::equal, DONE_LABEL) ;
361     bind(L_decrement_retry);
362     // Spin and retry if lock is busy.
363     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
364   }
365   else {
366     bind(L_decrement_retry);
367   }
368 }
369 
370 #endif //  INCLUDE_RTM_OPT
371 
372 // fast_lock and fast_unlock used by C2
373 
374 // Because the transitions from emitted code to the runtime
375 // monitorenter/exit helper stubs are so slow it's critical that
376 // we inline both the stack-locking fast path and the inflated fast path.
377 //
378 // See also: cmpFastLock and cmpFastUnlock.
379 //
380 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
382 // option would be to emit TrySlowEnter and TrySlowExit methods
383 // at startup-time.  These methods would accept arguments as
384 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
385 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
386 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
387 // In practice, however, the # of lock sites is bounded and is usually small.
388 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
392 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
394 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
395 // to those specialized methods.  That'd give us a mostly platform-independent
396 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
398 // to park() or unpark() threads.  We'd also need a few more unsafe operators
399 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) emit explicit barriers or fence operations.
401 //
402 // TODO:
403 //
404 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
405 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
406 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
407 //    the lock operators would typically be faster than reifying Self.
408 //
409 // *  Ideally I'd define the primitives as:
410 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
411 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
412 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
414 //    Furthermore the register assignments are overconstrained, possibly resulting in
415 //    sub-optimal code near the synchronization site.
416 //
417 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
418 //    Alternately, use a better sp-proximity test.
419 //
420 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
421 //    Either one is sufficient to uniquely identify a thread.
422 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
423 //
424 // *  Intrinsify notify() and notifyAll() for the common cases where the
425 //    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
427 //
428 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
429 //    But beware of excessive branch density on AMD Opterons.
430 //
431 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
432 //    or failure of the fast path.  If the fast path fails then we pass
433 //    control to the slow path, typically in C.  In fast_lock and
434 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
435 //    will emit a conditional branch immediately after the node.
436 //    So we have branches to branches and lots of ICC.ZF games.
437 //    Instead, it might be better to have C2 pass a "FailureLabel"
438 //    into fast_lock and fast_unlock.  In the case of success, control
439 //    will drop through the node.  ICC.ZF is undefined at exit.
440 //    In the case of failure, the node will branch directly to the
441 //    FailureLabel
442 
443 
444 // obj: object to lock
445 // box: on-stack box address (displaced header location) - KILLED
446 // rax,: tmp -- KILLED
447 // scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
455   // Ensure the register assignments are disjoint
456   assert(tmpReg == rax, "");
457 
458   if (use_rtm) {
459     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
460   } else {
461     assert(cx2Reg == noreg, "");
462     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
463   }
464 
465   if (counters != NULL) {
466     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
467   }
468 
469   // Possible cases that we'll encounter in fast_lock
470   // ------------------------------------------------
471   // * Inflated
472   //    -- unlocked
473   //    -- Locked
474   //       = by self
475   //       = by other
476   // * biased
477   //    -- by Self
478   //    -- by other
479   // * neutral
480   // * stack-locked
481   //    -- by self
482   //       = sp-proximity test hits
483   //       = sp-proximity test generates false-negative
484   //    -- by other
485   //
486 
487   Label IsInflated, DONE_LABEL;
488 
489   if (DiagnoseSyncOnValueBasedClasses != 0) {
490     load_klass(tmpReg, objReg, cx1Reg);
491     movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
492     testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
493     jcc(Assembler::notZero, DONE_LABEL);
494   }
495 
496   // it's stack-locked, biased or neutral
497   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
498   // order to reduce the number of conditional branches in the most common cases.
499   // Beware -- there's a subtle invariant that fetch of the markword
500   // at [FETCH], below, will never observe a biased encoding (*101b).
501   // If this invariant is not held we risk exclusion (safety) failure.
502   if (UseBiasedLocking && !UseOptoBiasInlining) {
503     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
504   }
505 
506 #if INCLUDE_RTM_OPT
507   if (UseRTMForStackLocks && use_rtm) {
508     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
509                       stack_rtm_counters, method_data, profile_rtm,
510                       DONE_LABEL, IsInflated);
511   }
512 #endif // INCLUDE_RTM_OPT
513 
514   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
515   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
516   jccb(Assembler::notZero, IsInflated);
517 
518   // Attempt stack-locking ...
519   orptr (tmpReg, markWord::unlocked_value);
520   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
521   lock();
522   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
523   if (counters != NULL) {
524     cond_inc32(Assembler::equal,
525                ExternalAddress((address)counters->fast_path_entry_count_addr()));
526   }
527   jcc(Assembler::equal, DONE_LABEL);           // Success
528 
529   // Recursive locking.
530   // The object is stack-locked: markword contains stack pointer to BasicLock.
531   // Locked by current thread if difference with current SP is less than one page.
532   subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
534   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
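  // Worked example of the mask above, assuming 4096-byte pages on LP64:
  // 7 - 4096 = -4089 = 0xFFFFFFFFFFFFF007, so the AND keeps the three low
  // lock/alignment bits plus every bit from the page size upward. The result
  // is zero (ZFlag == 1, recursive stack-lock) only when the fetched mark is
  // a stack-pointer-like address, with clear low lock bits, lying less than
  // one page above the current SP -- i.e. it plausibly points into this
  // thread's own stack.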
535   movptr(Address(boxReg, 0), tmpReg);
536   if (counters != NULL) {
537     cond_inc32(Assembler::equal,
538                ExternalAddress((address)counters->fast_path_entry_count_addr()));
539   }
540   jmp(DONE_LABEL);
541 
542   bind(IsInflated);
543   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
544 
545 #if INCLUDE_RTM_OPT
546   // Use the same RTM locking code in 32- and 64-bit VM.
547   if (use_rtm) {
548     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
549                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
550   } else {
551 #endif // INCLUDE_RTM_OPT
552 
553 #ifndef _LP64
554   // The object is inflated.
555 
556   // boxReg refers to the on-stack BasicLock in the current frame.
557   // We'd like to write:
558   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
560   // additional latency as we have another ST in the store buffer that must drain.
561 
562   // avoid ST-before-CAS
563   // register juggle because we need tmpReg for cmpxchgptr below
564   movptr(scrReg, boxReg);
565   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
566 
567   // Optimistic form: consider XORL tmpReg,tmpReg
568   movptr(tmpReg, NULL_WORD);
569 
570   // Appears unlocked - try to swing _owner from null to non-null.
571   // Ideally, I'd manifest "Self" with get_thread and then attempt
572   // to CAS the register containing Self into m->Owner.
573   // But we don't have enough registers, so instead we can either try to CAS
574   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
575   // we later store "Self" into m->Owner.  Transiently storing a stack address
576   // (rsp or the address of the box) into  m->owner is harmless.
577   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
578   lock();
579   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
580   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
581   // If we weren't able to swing _owner from NULL to the BasicLock
582   // then take the slow path.
583   jccb  (Assembler::notZero, DONE_LABEL);
584   // update _owner from BasicLock to thread
585   get_thread (scrReg);                    // beware: clobbers ICCs
586   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
587   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
588 
589   // If the CAS fails we can either retry or pass control to the slow path.
590   // We use the latter tactic.
591   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
592   // If the CAS was successful ...
593   //   Self has acquired the lock
594   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
595   // Intentional fall-through into DONE_LABEL ...
596 #else // _LP64
597   // It's inflated and we use scrReg for ObjectMonitor* in this section.
598   movq(scrReg, tmpReg);
599   xorq(tmpReg, tmpReg);
600   lock();
601   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
602   // Unconditionally set box->_displaced_header = markWord::unused_mark().
603   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
604   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
605   // Intentional fall-through into DONE_LABEL ...
606   // Propagate ICC.ZF from CAS above into DONE_LABEL.
607 #endif // _LP64
608 #if INCLUDE_RTM_OPT
609   } // use_rtm()
610 #endif
611   // DONE_LABEL is a hot target - we'd really like to place it at the
612   // start of cache line by padding with NOPs.
613   // See the AMD and Intel software optimization manuals for the
614   // most efficient "long" NOP encodings.
615   // Unfortunately none of our alignment mechanisms suffice.
616   bind(DONE_LABEL);
617 
618   // At DONE_LABEL the icc ZFlag is set as follows ...
619   // fast_unlock uses the same protocol.
620   // ZFlag == 1 -> Success
621   // ZFlag == 0 -> Failure - force control through the slow path
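  // Usage sketch (hedged -- the exact node definitions live in the .ad files):
  // the C2 nodes that wrap this code, e.g. cmpFastLock, treat the emitted
  // sequence as a compare and branch on ZFlag, continuing inline on success
  // and calling the runtime monitorenter path otherwise.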
622 }
623 
624 // obj: object to unlock
625 // box: box address (displaced header location), killed.  Must be EAX.
626 // tmp: killed, cannot be obj nor box.
627 //
628 // Some commentary on balanced locking:
629 //
630 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
631 // Methods that don't have provably balanced locking are forced to run in the
632 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
633 // The interpreter provides two properties:
634 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
636 //      interpreter maintains an on-stack list of locks currently held by
637 //      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
640 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
642 // B() doesn't have provably balanced locking so it runs in the interpreter.
643 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
644 // is still locked by A().
645 //
646 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
647 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
648 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
649 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
651 // could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
653 // A perfectly viable alternative is to elide the owner check except when
654 // Xcheck:jni is enabled.
655 
void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
657   assert(boxReg == rax, "");
658   assert_different_registers(objReg, boxReg, tmpReg);
659 
660   Label DONE_LABEL, Stacked, CheckSucc;
661 
662   // Critically, the biased locking test must have precedence over
663   // and appear before the (box->dhw == 0) recursive stack-lock test.
664   if (UseBiasedLocking && !UseOptoBiasInlining) {
665     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
666   }
667 
668 #if INCLUDE_RTM_OPT
669   if (UseRTMForStackLocks && use_rtm) {
670     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
671     Label L_regular_unlock;
672     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
673     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
674     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
675     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
676     xend();                                                           // otherwise end...
677     jmp(DONE_LABEL);                                                  // ... and we're done
678     bind(L_regular_unlock);
679   }
680 #endif
681 
682   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
683   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
684   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
685   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
686   jccb  (Assembler::zero, Stacked);
687 
688   // It's inflated.
689 #if INCLUDE_RTM_OPT
690   if (use_rtm) {
691     Label L_regular_inflated_unlock;
692     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
693     movptr(boxReg, Address(tmpReg, owner_offset));
694     testptr(boxReg, boxReg);
695     jccb(Assembler::notZero, L_regular_inflated_unlock);
696     xend();
697     jmpb(DONE_LABEL);
698     bind(L_regular_inflated_unlock);
699   }
700 #endif
701 
702   // Despite our balanced locking property we still check that m->_owner == Self
703   // as java routines or native JNI code called by this thread might
704   // have released the lock.
705   // Refer to the comments in synchronizer.cpp for how we might encode extra
706   // state in _succ so we can avoid fetching EntryList|cxq.
707   //
708   // I'd like to add more cases in fast_lock() and fast_unlock() --
709   // such as recursive enter and exit -- but we have to be wary of
710   // I$ bloat, T$ effects and BP$ effects.
711   //
712   // If there's no contention try a 1-0 exit.  That is, exit without
713   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
714   // we detect and recover from the race that the 1-0 exit admits.
715   //
716   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
717   // before it STs null into _owner, releasing the lock.  Updates
718   // to data protected by the critical section must be visible before
719   // we drop the lock (and thus before any other thread could acquire
720   // the lock and observe the fields protected by the lock).
721   // IA32's memory-model is SPO, so STs are ordered with respect to
722   // each other and there's no need for an explicit barrier (fence).
723   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
724 #ifndef _LP64
725   get_thread (boxReg);
726 
727   // Note that we could employ various encoding schemes to reduce
728   // the number of loads below (currently 4) to just 2 or 3.
729   // Refer to the comments in synchronizer.cpp.
730   // In practice the chain of fetches doesn't seem to impact performance, however.
731   xorptr(boxReg, boxReg);
732   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
733   jccb  (Assembler::notZero, DONE_LABEL);
734   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
735   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
736   jccb  (Assembler::notZero, CheckSucc);
737   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
738   jmpb  (DONE_LABEL);
739 
740   bind (Stacked);
741   // It's not inflated and it's not recursively stack-locked and it's not biased.
742   // It must be stack-locked.
743   // Try to reset the header to displaced header.
744   // The "box" value on the stack is stable, so we can reload
745   // and be assured we observe the same value as above.
746   movptr(tmpReg, Address(boxReg, 0));
747   lock();
748   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
750 
751   // DONE_LABEL is a hot target - we'd really like to place it at the
752   // start of cache line by padding with NOPs.
753   // See the AMD and Intel software optimization manuals for the
754   // most efficient "long" NOP encodings.
755   // Unfortunately none of our alignment mechanisms suffice.
756   bind (CheckSucc);
757 #else // _LP64
758   // It's inflated
759   xorptr(boxReg, boxReg);
760   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
761   jccb  (Assembler::notZero, DONE_LABEL);
762   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
763   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
764   jccb  (Assembler::notZero, CheckSucc);
765   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
766   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
767   jmpb  (DONE_LABEL);
768 
769   // Try to avoid passing control into the slow_path ...
770   Label LSuccess, LGoSlowPath ;
771   bind  (CheckSucc);
772 
773   // The following optional optimization can be elided if necessary
774   // Effectively: if (succ == null) goto slow path
775   // The code reduces the window for a race, however,
776   // and thus benefits performance.
777   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
778   jccb  (Assembler::zero, LGoSlowPath);
779 
780   xorptr(boxReg, boxReg);
781   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
782   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
783 
784   // Memory barrier/fence
785   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
786   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
787   // This is faster on Nehalem and AMD Shanghai/Barcelona.
788   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
789   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
790   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
791   lock(); addl(Address(rsp, 0), 0);
792 
793   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
794   jccb  (Assembler::notZero, LSuccess);
795 
796   // Rare inopportune interleaving - race.
797   // The successor vanished in the small window above.
798   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
799   // We need to ensure progress and succession.
800   // Try to reacquire the lock.
801   // If that fails then the new owner is responsible for succession and this
802   // thread needs to take no further action and can exit via the fast path (success).
803   // If the re-acquire succeeds then pass control into the slow path.
804   // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.
807 
808   // box is really RAX -- the following CMPXCHG depends on that binding
809   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
810   lock();
811   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
812   // There's no successor so we tried to regrab the lock.
813   // If that didn't work, then another thread grabbed the
814   // lock so we're done (and exit was a success).
815   jccb  (Assembler::notEqual, LSuccess);
816   // Intentional fall-through into slow path
817 
818   bind  (LGoSlowPath);
819   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
820   jmpb  (DONE_LABEL);
821 
822   bind  (LSuccess);
823   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
824   jmpb  (DONE_LABEL);
825 
826   bind  (Stacked);
827   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
828   lock();
829   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
830 
831 #endif
832   bind(DONE_LABEL);
833 }
834 
835 //-------------------------------------------------------------------------------------------
836 // Generic instructions support for use in .ad files C2 code generation
837 
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
839   if (dst != src) {
840     movdqu(dst, src);
841   }
842   if (opcode == Op_AbsVD) {
843     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
844   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
846     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
847   }
848 }
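// Illustrative bit patterns for the sign-mask trick used above and in the
// float variants below (shown per double lane; the float masks are the 32-bit
// analogues, and the stub constants are assumed to hold exactly these values):
//   abs: AND with vector_double_sign_mask = 0x7FFFFFFFFFFFFFFF clears the
//        sign bit, e.g. -2.5 -> 2.5.
//   neg: XOR with vector_double_sign_flip = 0x8000000000000000 toggles the
//        sign bit, e.g. 2.5 -> -2.5 and -0.0 -> +0.0.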
849 
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
851   if (opcode == Op_AbsVD) {
852     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
853   } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
855     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
856   }
857 }
858 
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
860   if (dst != src) {
861     movdqu(dst, src);
862   }
863   if (opcode == Op_AbsVF) {
864     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
865   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
867     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
868   }
869 }
870 
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
872   if (opcode == Op_AbsVF) {
873     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
874   } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
876     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
877   }
878 }
879 
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
881   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
882   assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
883 
884   if (opcode == Op_MinV) {
885     if (elem_bt == T_BYTE) {
886       pminsb(dst, src);
887     } else if (elem_bt == T_SHORT) {
888       pminsw(dst, src);
889     } else if (elem_bt == T_INT) {
890       pminsd(dst, src);
891     } else {
892       assert(elem_bt == T_LONG, "required");
893       assert(tmp == xmm0, "required");
894       assert_different_registers(dst, src, tmp);
895       movdqu(xmm0, dst);
896       pcmpgtq(xmm0, src);
897       blendvpd(dst, src);  // xmm0 as mask
898     }
899   } else { // opcode == Op_MaxV
900     if (elem_bt == T_BYTE) {
901       pmaxsb(dst, src);
902     } else if (elem_bt == T_SHORT) {
903       pmaxsw(dst, src);
904     } else if (elem_bt == T_INT) {
905       pmaxsd(dst, src);
906     } else {
907       assert(elem_bt == T_LONG, "required");
908       assert(tmp == xmm0, "required");
909       assert_different_registers(dst, src, tmp);
910       movdqu(xmm0, src);
911       pcmpgtq(xmm0, dst);
912       blendvpd(dst, src);  // xmm0 as mask
913     }
914   }
915 }
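// Note on the T_LONG paths above: there is no packed signed min/max for
// 64-bit lanes below AVX-512 (vpminsq/vpmaxsq), so the long case is
// synthesized as compare-then-blend. For the min case, per lane:
//   mask = (dst > src) ? all-ones : all-zeros   // pcmpgtq into xmm0
//   dst  = mask-sign-set ? src : dst            // blendvpd keys off xmm0
// which keeps the smaller element; the max case swaps the compare operands
// (src > dst) so the larger element is kept.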
916 
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
920   assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
921 
922   if (opcode == Op_MinV) {
923     if (elem_bt == T_BYTE) {
924       vpminsb(dst, src1, src2, vlen_enc);
925     } else if (elem_bt == T_SHORT) {
926       vpminsw(dst, src1, src2, vlen_enc);
927     } else if (elem_bt == T_INT) {
928       vpminsd(dst, src1, src2, vlen_enc);
929     } else {
930       assert(elem_bt == T_LONG, "required");
931       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
932         vpminsq(dst, src1, src2, vlen_enc);
933       } else {
934         assert_different_registers(dst, src1, src2);
935         vpcmpgtq(dst, src1, src2, vlen_enc);
936         vblendvpd(dst, src1, src2, dst, vlen_enc);
937       }
938     }
939   } else { // opcode == Op_MaxV
940     if (elem_bt == T_BYTE) {
941       vpmaxsb(dst, src1, src2, vlen_enc);
942     } else if (elem_bt == T_SHORT) {
943       vpmaxsw(dst, src1, src2, vlen_enc);
944     } else if (elem_bt == T_INT) {
945       vpmaxsd(dst, src1, src2, vlen_enc);
946     } else {
947       assert(elem_bt == T_LONG, "required");
948       if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
949         vpmaxsq(dst, src1, src2, vlen_enc);
950       } else {
951         assert_different_registers(dst, src1, src2);
952         vpcmpgtq(dst, src1, src2, vlen_enc);
953         vblendvpd(dst, src2, src1, dst, vlen_enc);
954       }
955     }
956   }
957 }
958 
959 // Float/Double min max
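//
// A hedged note on why vminmax_fp/evminmax_fp below are not a plain
// vminps/vmaxps: Java's Math.min/max semantics require min(-0.0, +0.0) == -0.0,
// max(-0.0, +0.0) == +0.0, and NaN propagation from either input, whereas the
// raw SSE/AVX min/max instructions simply return the second operand in those
// cases. The sequences below therefore first blend a/b by sign (with vblendv
// or, in the AVX-512 variant, a mask register built by evpmovd2m/evpmovq2m) so
// the preferred zero lands in the right operand position, perform the min/max,
// and then use an unordered compare (UNORD_Q) to patch NaN lanes back in.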
960 
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
965   assert(UseAVX > 0, "required");
966   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
967          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
968   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
969   assert_different_registers(a, b, tmp, atmp, btmp);
970 
971   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
972   bool is_double_word = is_double_word_type(elem_bt);
973 
974   if (!is_double_word && is_min) {
975     vblendvps(atmp, a, b, a, vlen_enc);
976     vblendvps(btmp, b, a, a, vlen_enc);
977     vminps(tmp, atmp, btmp, vlen_enc);
978     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
979     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
980   } else if (!is_double_word && !is_min) {
981     vblendvps(btmp, b, a, b, vlen_enc);
982     vblendvps(atmp, a, b, b, vlen_enc);
983     vmaxps(tmp, atmp, btmp, vlen_enc);
984     vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
985     vblendvps(dst, tmp, atmp, btmp, vlen_enc);
986   } else if (is_double_word && is_min) {
987     vblendvpd(atmp, a, b, a, vlen_enc);
988     vblendvpd(btmp, b, a, a, vlen_enc);
989     vminpd(tmp, atmp, btmp, vlen_enc);
990     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
991     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
992   } else {
993     assert(is_double_word && !is_min, "sanity");
994     vblendvpd(btmp, b, a, b, vlen_enc);
995     vblendvpd(atmp, a, b, b, vlen_enc);
996     vmaxpd(tmp, atmp, btmp, vlen_enc);
997     vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
998     vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
999   }
1000 }
1001 
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
1006   assert(UseAVX > 2, "required");
1007   assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1008          opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1009   assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1010   assert_different_registers(dst, a, b, atmp, btmp);
1011 
1012   bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1013   bool is_double_word = is_double_word_type(elem_bt);
1014   bool merge = true;
1015 
1016   if (!is_double_word && is_min) {
1017     evpmovd2m(ktmp, a, vlen_enc);
1018     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1019     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1020     vminps(dst, atmp, btmp, vlen_enc);
1021     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1022     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1023   } else if (!is_double_word && !is_min) {
1024     evpmovd2m(ktmp, b, vlen_enc);
1025     evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1026     evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1027     vmaxps(dst, atmp, btmp, vlen_enc);
1028     evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029     evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1030   } else if (is_double_word && is_min) {
1031     evpmovq2m(ktmp, a, vlen_enc);
1032     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1033     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1034     vminpd(dst, atmp, btmp, vlen_enc);
1035     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1037   } else {
1038     assert(is_double_word && !is_min, "sanity");
1039     evpmovq2m(ktmp, b, vlen_enc);
1040     evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1041     evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1042     vmaxpd(dst, atmp, btmp, vlen_enc);
1043     evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1044     evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1045   }
1046 }
1047 
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1049   if (sign) {
1050     pmovsxbw(dst, src);
1051   } else {
1052     pmovzxbw(dst, src);
1053   }
1054 }
1055 
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1057   if (sign) {
1058     vpmovsxbw(dst, src, vector_len);
1059   } else {
1060     vpmovzxbw(dst, src, vector_len);
1061   }
1062 }
1063 
void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1065   if (sign) {
1066     vpmovsxbd(dst, src, vector_len);
1067   } else {
1068     vpmovzxbd(dst, src, vector_len);
1069   }
1070 }
1071 
void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1073   if (sign) {
1074     vpmovsxwd(dst, src, vector_len);
1075   } else {
1076     vpmovzxwd(dst, src, vector_len);
1077   }
1078 }
1079 
void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
1082   if (opcode == Op_RotateLeftV) {
1083     if (etype == T_INT) {
1084       evprold(dst, src, shift, vector_len);
1085     } else {
1086       assert(etype == T_LONG, "expected type T_LONG");
1087       evprolq(dst, src, shift, vector_len);
1088     }
1089   } else {
1090     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1091     if (etype == T_INT) {
1092       evprord(dst, src, shift, vector_len);
1093     } else {
1094       assert(etype == T_LONG, "expected type T_LONG");
1095       evprorq(dst, src, shift, vector_len);
1096     }
1097   }
1098 }
1099 
void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
1102   if (opcode == Op_RotateLeftV) {
1103     if (etype == T_INT) {
1104       evprolvd(dst, src, shift, vector_len);
1105     } else {
1106       assert(etype == T_LONG, "expected type T_LONG");
1107       evprolvq(dst, src, shift, vector_len);
1108     }
1109   } else {
1110     assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1111     if (etype == T_INT) {
1112       evprorvd(dst, src, shift, vector_len);
1113     } else {
1114       assert(etype == T_LONG, "expected type T_LONG");
1115       evprorvq(dst, src, shift, vector_len);
1116     }
1117   }
1118 }
1119 
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1121   if (opcode == Op_RShiftVI) {
1122     psrad(dst, shift);
1123   } else if (opcode == Op_LShiftVI) {
1124     pslld(dst, shift);
1125   } else {
1126     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1127     psrld(dst, shift);
1128   }
1129 }
1130 
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1132   switch (opcode) {
1133     case Op_RShiftVI:  psrad(dst, shift); break;
1134     case Op_LShiftVI:  pslld(dst, shift); break;
1135     case Op_URShiftVI: psrld(dst, shift); break;
1136 
1137     default: assert(false, "%s", NodeClassNames[opcode]);
1138   }
1139 }
1140 
void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1142   if (opcode == Op_RShiftVI) {
1143     vpsrad(dst, nds, shift, vector_len);
1144   } else if (opcode == Op_LShiftVI) {
1145     vpslld(dst, nds, shift, vector_len);
1146   } else {
1147     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1148     vpsrld(dst, nds, shift, vector_len);
1149   }
1150 }
1151 
void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1153   switch (opcode) {
1154     case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
1155     case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
1156     case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1157 
1158     default: assert(false, "%s", NodeClassNames[opcode]);
1159   }
1160 }
1161 
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1163   switch (opcode) {
1164     case Op_RShiftVB:  // fall-through
1165     case Op_RShiftVS:  psraw(dst, shift); break;
1166 
1167     case Op_LShiftVB:  // fall-through
1168     case Op_LShiftVS:  psllw(dst, shift);   break;
1169 
1170     case Op_URShiftVS: // fall-through
1171     case Op_URShiftVB: psrlw(dst, shift);  break;
1172 
1173     default: assert(false, "%s", NodeClassNames[opcode]);
1174   }
1175 }
1176 
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1178   switch (opcode) {
1179     case Op_RShiftVB:  // fall-through
1180     case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;
1181 
1182     case Op_LShiftVB:  // fall-through
1183     case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
1184 
1185     case Op_URShiftVS: // fall-through
1186     case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1187 
1188     default: assert(false, "%s", NodeClassNames[opcode]);
1189   }
1190 }
1191 
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1193   switch (opcode) {
1194     case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1195     case Op_LShiftVL:  psllq(dst, shift); break;
1196     case Op_URShiftVL: psrlq(dst, shift); break;
1197 
1198     default: assert(false, "%s", NodeClassNames[opcode]);
1199   }
1200 }
1201 
vshiftq_imm(int opcode,XMMRegister dst,int shift)1202 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1203   if (opcode == Op_RShiftVL) {
1204     psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1205   } else if (opcode == Op_LShiftVL) {
1206     psllq(dst, shift);
1207   } else {
1208     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1209     psrlq(dst, shift);
1210   }
1211 }
1212 
vshiftq(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vlen_enc)1213 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1214   switch (opcode) {
1215     case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1216     case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
1217     case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1218 
1219     default: assert(false, "%s", NodeClassNames[opcode]);
1220   }
1221 }
1222 
vshiftq_imm(int opcode,XMMRegister dst,XMMRegister nds,int shift,int vector_len)1223 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1224   if (opcode == Op_RShiftVL) {
1225     evpsraq(dst, nds, shift, vector_len);
1226   } else if (opcode == Op_LShiftVL) {
1227     vpsllq(dst, nds, shift, vector_len);
1228   } else {
1229     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1230     vpsrlq(dst, nds, shift, vector_len);
1231   }
1232 }
1233 
varshiftd(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vlen_enc)1234 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1235   switch (opcode) {
1236     case Op_RShiftVB:  // fall-through
1237     case Op_RShiftVS:  // fall-through
1238     case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;
1239 
1240     case Op_LShiftVB:  // fall-through
1241     case Op_LShiftVS:  // fall-through
1242     case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;
1243 
1244     case Op_URShiftVB: // fall-through
1245     case Op_URShiftVS: // fall-through
1246     case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1247 
1248     default: assert(false, "%s", NodeClassNames[opcode]);
1249   }
1250 }
1251 
varshiftw(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vlen_enc)1252 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1253   switch (opcode) {
1254     case Op_RShiftVB:  // fall-through
1255     case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1256 
1257     case Op_LShiftVB:  // fall-through
1258     case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;
1259 
1260     case Op_URShiftVB: // fall-through
1261     case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1262 
1263     default: assert(false, "%s", NodeClassNames[opcode]);
1264   }
1265 }
1266 
varshiftq(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vlen_enc,XMMRegister tmp)1267 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1268   assert(UseAVX >= 2, "required");
1269   switch (opcode) {
1270     case Op_RShiftVL: {
1271       if (UseAVX > 2) {
1272         assert(tmp == xnoreg, "not used");
1273         if (!VM_Version::supports_avx512vl()) {
1274           vlen_enc = Assembler::AVX_512bit;
1275         }
1276         evpsravq(dst, src, shift, vlen_enc);
1277       } else {
1278         vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1279         vpsrlvq(dst, src, shift, vlen_enc);
1280         vpsrlvq(tmp, tmp, shift, vlen_enc);
1281         vpxor(dst, dst, tmp, vlen_enc);
1282         vpsubq(dst, dst, tmp, vlen_enc);
1283       }
1284       break;
1285     }
1286     case Op_LShiftVL: {
1287       assert(tmp == xnoreg, "not used");
1288       vpsllvq(dst, src, shift, vlen_enc);
1289       break;
1290     }
1291     case Op_URShiftVL: {
1292       assert(tmp == xnoreg, "not used");
1293       vpsrlvq(dst, src, shift, vlen_enc);
1294       break;
1295     }
1296     default: assert(false, "%s", NodeClassNames[opcode]);
1297   }
1298 }
1299 
1300 // Variable shift of src by shift, using vtmp and scratch as temporaries, giving a word result in dst
varshiftbw(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vector_len,XMMRegister vtmp,Register scratch)1301 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1302   assert(opcode == Op_LShiftVB ||
1303          opcode == Op_RShiftVB ||
1304          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1305   bool sign = (opcode != Op_URShiftVB);
1306   assert(vector_len == 0, "required");
1307   vextendbd(sign, dst, src, 1);
1308   vpmovzxbd(vtmp, shift, 1);
1309   varshiftd(opcode, dst, dst, vtmp, 1);
1310   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1311   vextracti128_high(vtmp, dst);
1312   vpackusdw(dst, dst, vtmp, 0);
1313 }
1314 
1315 // Variable shift of src by shift, using vtmp and scratch as temporaries, giving a byte result in dst
evarshiftb(int opcode,XMMRegister dst,XMMRegister src,XMMRegister shift,int vector_len,XMMRegister vtmp,Register scratch)1316 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1317   assert(opcode == Op_LShiftVB ||
1318          opcode == Op_RShiftVB ||
1319          opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1320   bool sign = (opcode != Op_URShiftVB);
1321   int ext_vector_len = vector_len + 1;
1322   vextendbw(sign, dst, src, ext_vector_len);
1323   vpmovzxbw(vtmp, shift, ext_vector_len);
1324   varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1325   vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1326   if (vector_len == 0) {
1327     vextracti128_high(vtmp, dst);
1328     vpackuswb(dst, dst, vtmp, vector_len);
1329   } else {
1330     vextracti64x4_high(vtmp, dst);
1331     vpackuswb(dst, dst, vtmp, vector_len);
1332     vpermq(dst, dst, 0xD8, vector_len);
1333   }
1334 }
1335 
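// Insert the scalar value in 'val' into element 'idx' of dst, dispatched on the element type.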
insert(BasicType typ,XMMRegister dst,Register val,int idx)1336 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1337   switch(typ) {
1338     case T_BYTE:
1339       pinsrb(dst, val, idx);
1340       break;
1341     case T_SHORT:
1342       pinsrw(dst, val, idx);
1343       break;
1344     case T_INT:
1345       pinsrd(dst, val, idx);
1346       break;
1347     case T_LONG:
1348       pinsrq(dst, val, idx);
1349       break;
1350     default:
1351       assert(false,"Should not reach here.");
1352       break;
1353   }
1354 }
1355 
vinsert(BasicType typ,XMMRegister dst,XMMRegister src,Register val,int idx)1356 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1357   switch(typ) {
1358     case T_BYTE:
1359       vpinsrb(dst, src, val, idx);
1360       break;
1361     case T_SHORT:
1362       vpinsrw(dst, src, val, idx);
1363       break;
1364     case T_INT:
1365       vpinsrd(dst, src, val, idx);
1366       break;
1367     case T_LONG:
1368       vpinsrq(dst, src, val, idx);
1369       break;
1370     default:
1371       assert(false,"Should not reach here.");
1372       break;
1373   }
1374 }
1375 
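// AVX2 gather: load elements of type 'typ' from base + 32-bit indices in 'idx'; 'mask' selects the active elements.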
vgather(BasicType typ,XMMRegister dst,Register base,XMMRegister idx,XMMRegister mask,int vector_len)1376 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1377   switch(typ) {
1378     case T_INT:
1379       vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1380       break;
1381     case T_FLOAT:
1382       vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1383       break;
1384     case T_LONG:
1385       vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1386       break;
1387     case T_DOUBLE:
1388       vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1389       break;
1390     default:
1391       assert(false,"Should not reach here.");
1392       break;
1393   }
1394 }
1395 
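// AVX-512 gather: as above, but the active elements are selected by opmask register 'mask'.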
evgather(BasicType typ,XMMRegister dst,KRegister mask,Register base,XMMRegister idx,int vector_len)1396 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1397   switch(typ) {
1398     case T_INT:
1399       evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1400       break;
1401     case T_FLOAT:
1402       evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1403       break;
1404     case T_LONG:
1405       evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1406       break;
1407     case T_DOUBLE:
1408       evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1409       break;
1410     default:
1411       assert(false,"Should not reach here.");
1412       break;
1413   }
1414 }
1415 
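// AVX-512 scatter: store the elements of 'src' to base + 32-bit indices in 'idx' under opmask 'mask'.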
evscatter(BasicType typ,Register base,XMMRegister idx,KRegister mask,XMMRegister src,int vector_len)1416 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1417   switch(typ) {
1418     case T_INT:
1419       evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1420       break;
1421     case T_FLOAT:
1422       evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1423       break;
1424     case T_LONG:
1425       evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1426       break;
1427     case T_DOUBLE:
1428       evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1429       break;
1430     default:
1431       assert(false,"Should not reach here.");
1432       break;
1433   }
1434 }
1435 
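// Turn a vector of booleans (one byte per element, 0 or 1) into a full-width element mask:
// negate the bytes (0 -> 0x00, 1 -> 0xFF) and sign-extend them to the element size.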
load_vector_mask(XMMRegister dst,XMMRegister src,int vlen_in_bytes,BasicType elem_bt)1436 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
1437   if (vlen_in_bytes <= 16) {
1438     pxor (dst, dst);
1439     psubb(dst, src);
1440     switch (elem_bt) {
1441       case T_BYTE:   /* nothing to do */ break;
1442       case T_SHORT:  pmovsxbw(dst, dst); break;
1443       case T_INT:    pmovsxbd(dst, dst); break;
1444       case T_FLOAT:  pmovsxbd(dst, dst); break;
1445       case T_LONG:   pmovsxbq(dst, dst); break;
1446       case T_DOUBLE: pmovsxbq(dst, dst); break;
1447 
1448       default: assert(false, "%s", type2name(elem_bt));
1449     }
1450   } else {
1451     int vlen_enc = vector_length_encoding(vlen_in_bytes);
1452 
1453     vpxor (dst, dst, dst, vlen_enc);
1454     vpsubb(dst, dst, src, vlen_enc);
1455     switch (elem_bt) {
1456       case T_BYTE:   /* nothing to do */            break;
1457       case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
1458       case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
1459       case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
1460       case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
1461       case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1462 
1463       default: assert(false, "%s", type2name(elem_bt));
1464     }
1465   }
1466 }
1467 
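// Load the first vlen_in_bytes bytes of the iota constant (ascending element indices 0, 1, 2, ...) into dst.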
load_iota_indices(XMMRegister dst,Register scratch,int vlen_in_bytes)1468 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1469   ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1470   if (vlen_in_bytes <= 16) {
1471     movdqu(dst, addr, scratch);
1472   } else if (vlen_in_bytes == 32) {
1473     vmovdqu(dst, addr, scratch);
1474   } else {
1475     assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1476     evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1477   }
1478 }
1479 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1480 
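// Single 128-bit combine step shared by the reductions below: combine dst and src with the
// reduction operator (element-wise for integral types, scalar low element for float/double).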
reduce_operation_128(BasicType typ,int opcode,XMMRegister dst,XMMRegister src)1481 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1482   int vector_len = Assembler::AVX_128bit;
1483 
1484   switch (opcode) {
1485     case Op_AndReductionV:  pand(dst, src); break;
1486     case Op_OrReductionV:   por (dst, src); break;
1487     case Op_XorReductionV:  pxor(dst, src); break;
1488     case Op_MinReductionV:
1489       switch (typ) {
1490         case T_BYTE:        pminsb(dst, src); break;
1491         case T_SHORT:       pminsw(dst, src); break;
1492         case T_INT:         pminsd(dst, src); break;
1493         case T_LONG:        assert(UseAVX > 2, "required");
1494                             vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1495         default:            assert(false, "wrong type");
1496       }
1497       break;
1498     case Op_MaxReductionV:
1499       switch (typ) {
1500         case T_BYTE:        pmaxsb(dst, src); break;
1501         case T_SHORT:       pmaxsw(dst, src); break;
1502         case T_INT:         pmaxsd(dst, src); break;
1503         case T_LONG:        assert(UseAVX > 2, "required");
1504                             vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1505         default:            assert(false, "wrong type");
1506       }
1507       break;
1508     case Op_AddReductionVF: addss(dst, src); break;
1509     case Op_AddReductionVD: addsd(dst, src); break;
1510     case Op_AddReductionVI:
1511       switch (typ) {
1512         case T_BYTE:        paddb(dst, src); break;
1513         case T_SHORT:       paddw(dst, src); break;
1514         case T_INT:         paddd(dst, src); break;
1515         default:            assert(false, "wrong type");
1516       }
1517       break;
1518     case Op_AddReductionVL: paddq(dst, src); break;
1519     case Op_MulReductionVF: mulss(dst, src); break;
1520     case Op_MulReductionVD: mulsd(dst, src); break;
1521     case Op_MulReductionVI:
1522       switch (typ) {
1523         case T_SHORT:       pmullw(dst, src); break;
1524         case T_INT:         pmulld(dst, src); break;
1525         default:            assert(false, "wrong type");
1526       }
1527       break;
1528     case Op_MulReductionVL: assert(UseAVX > 2, "required");
1529                             vpmullq(dst, dst, src, vector_len); break;
1530     default:                assert(false, "wrong opcode");
1531   }
1532 }
1533 
reduce_operation_256(BasicType typ,int opcode,XMMRegister dst,XMMRegister src1,XMMRegister src2)1534 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst,  XMMRegister src1, XMMRegister src2) {
1535   int vector_len = Assembler::AVX_256bit;
1536 
1537   switch (opcode) {
1538     case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
1539     case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
1540     case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
1541     case Op_MinReductionV:
1542       switch (typ) {
1543         case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
1544         case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
1545         case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
1546         case T_LONG:        assert(UseAVX > 2, "required");
1547                             vpminsq(dst, src1, src2, vector_len); break;
1548         default:            assert(false, "wrong type");
1549       }
1550       break;
1551     case Op_MaxReductionV:
1552       switch (typ) {
1553         case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
1554         case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
1555         case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
1556         case T_LONG:        assert(UseAVX > 2, "required");
1557                             vpmaxsq(dst, src1, src2, vector_len); break;
1558         default:            assert(false, "wrong type");
1559       }
1560       break;
1561     case Op_AddReductionVI:
1562       switch (typ) {
1563         case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
1564         case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
1565         case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
1566         default:            assert(false, "wrong type");
1567       }
1568       break;
1569     case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1570     case Op_MulReductionVI:
1571       switch (typ) {
1572         case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
1573         case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
1574         default:            assert(false, "wrong type");
1575       }
1576       break;
1577     case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1578     default:                assert(false, "wrong opcode");
1579   }
1580 }
1581 
reduce_fp(int opcode,int vlen,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1582 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1583                                   XMMRegister dst, XMMRegister src,
1584                                   XMMRegister vtmp1, XMMRegister vtmp2) {
1585   switch (opcode) {
1586     case Op_AddReductionVF:
1587     case Op_MulReductionVF:
1588       reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1589       break;
1590 
1591     case Op_AddReductionVD:
1592     case Op_MulReductionVD:
1593       reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1594       break;
1595 
1596     default: assert(false, "wrong opcode");
1597   }
1598 }
1599 
reduceB(int opcode,int vlen,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1600 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1601                              Register dst, Register src1, XMMRegister src2,
1602                              XMMRegister vtmp1, XMMRegister vtmp2) {
1603   switch (vlen) {
1604     case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1605     case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1606     case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1607     case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1608 
1609     default: assert(false, "wrong vector length");
1610   }
1611 }
1612 
mulreduceB(int opcode,int vlen,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1613 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1614                              Register dst, Register src1, XMMRegister src2,
1615                              XMMRegister vtmp1, XMMRegister vtmp2) {
1616   switch (vlen) {
1617     case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1618     case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1619     case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1620     case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1621 
1622     default: assert(false, "wrong vector length");
1623   }
1624 }
1625 
reduceS(int opcode,int vlen,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1626 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1627                              Register dst, Register src1, XMMRegister src2,
1628                              XMMRegister vtmp1, XMMRegister vtmp2) {
1629   switch (vlen) {
1630     case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1631     case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1632     case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1633     case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1634 
1635     default: assert(false, "wrong vector length");
1636   }
1637 }
1638 
reduceI(int opcode,int vlen,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1639 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1640                              Register dst, Register src1, XMMRegister src2,
1641                              XMMRegister vtmp1, XMMRegister vtmp2) {
1642   switch (vlen) {
1643     case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1644     case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1645     case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1646     case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1647 
1648     default: assert(false, "wrong vector length");
1649   }
1650 }
1651 
1652 #ifdef _LP64
reduceL(int opcode,int vlen,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1653 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1654                              Register dst, Register src1, XMMRegister src2,
1655                              XMMRegister vtmp1, XMMRegister vtmp2) {
1656   switch (vlen) {
1657     case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1658     case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1659     case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1660 
1661     default: assert(false, "wrong vector length");
1662   }
1663 }
1664 #endif // _LP64
1665 
reduceF(int opcode,int vlen,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1666 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1667   switch (vlen) {
1668     case 2:
1669       assert(vtmp2 == xnoreg, "");
1670       reduce2F(opcode, dst, src, vtmp1);
1671       break;
1672     case 4:
1673       assert(vtmp2 == xnoreg, "");
1674       reduce4F(opcode, dst, src, vtmp1);
1675       break;
1676     case 8:
1677       reduce8F(opcode, dst, src, vtmp1, vtmp2);
1678       break;
1679     case 16:
1680       reduce16F(opcode, dst, src, vtmp1, vtmp2);
1681       break;
1682     default: assert(false, "wrong vector length");
1683   }
1684 }
1685 
reduceD(int opcode,int vlen,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1686 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1687   switch (vlen) {
1688     case 2:
1689       assert(vtmp2 == xnoreg, "");
1690       reduce2D(opcode, dst, src, vtmp1);
1691       break;
1692     case 4:
1693       reduce4D(opcode, dst, src, vtmp1, vtmp2);
1694       break;
1695     case 8:
1696       reduce8D(opcode, dst, src, vtmp1, vtmp2);
1697       break;
1698     default: assert(false, "wrong vector length");
1699   }
1700 }
1701 
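// The reduceN* routines below repeatedly fold the upper half of the vector onto the
// lower half and finally combine the reduced lane with the scalar input src1.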
reduce2I(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1702 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1703   if (opcode == Op_AddReductionVI) {
1704     if (vtmp1 != src2) {
1705       movdqu(vtmp1, src2);
1706     }
1707     phaddd(vtmp1, vtmp1);
1708   } else {
1709     pshufd(vtmp1, src2, 0x1);
1710     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1711   }
1712   movdl(vtmp2, src1);
1713   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1714   movdl(dst, vtmp1);
1715 }
1716 
reduce4I(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1717 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1718   if (opcode == Op_AddReductionVI) {
1719     if (vtmp1 != src2) {
1720       movdqu(vtmp1, src2);
1721     }
1722     phaddd(vtmp1, src2);
1723     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1724   } else {
1725     pshufd(vtmp2, src2, 0xE);
1726     reduce_operation_128(T_INT, opcode, vtmp2, src2);
1727     reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1728   }
1729 }
1730 
reduce8I(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1731 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1732   if (opcode == Op_AddReductionVI) {
1733     vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1734     vextracti128_high(vtmp2, vtmp1);
1735     vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1736     reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1737   } else {
1738     vextracti128_high(vtmp1, src2);
1739     reduce_operation_128(T_INT, opcode, vtmp1, src2);
1740     reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1741   }
1742 }
1743 
reduce16I(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1744 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1745   vextracti64x4_high(vtmp2, src2);
1746   reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1747   reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1748 }
1749 
reduce8B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1750 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1751   pshufd(vtmp2, src2, 0x1);
1752   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1753   movdqu(vtmp1, vtmp2);
1754   psrldq(vtmp1, 2);
1755   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1756   movdqu(vtmp2, vtmp1);
1757   psrldq(vtmp2, 1);
1758   reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1759   movdl(vtmp2, src1);
1760   pmovsxbd(vtmp1, vtmp1);
1761   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1762   pextrb(dst, vtmp1, 0x0);
1763   movsbl(dst, dst);
1764 }
1765 
reduce16B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1766 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1767   pshufd(vtmp1, src2, 0xE);
1768   reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1769   reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1770 }
1771 
reduce32B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1772 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1773   vextracti128_high(vtmp2, src2);
1774   reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1775   reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1776 }
1777 
reduce64B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1778 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1779   vextracti64x4_high(vtmp1, src2);
1780   reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1781   reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1782 }
1783 
mulreduce8B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1784 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1785   pmovsxbw(vtmp2, src2);
1786   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1787 }
1788 
mulreduce16B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1789 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1790   if (UseAVX > 1) {
1791     int vector_len = Assembler::AVX_256bit;
1792     vpmovsxbw(vtmp1, src2, vector_len);
1793     reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1794   } else {
1795     pmovsxbw(vtmp2, src2);
1796     reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1797     pshufd(vtmp2, src2, 0x1);
1798     pmovsxbw(vtmp2, src2);
1799     reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1800   }
1801 }
1802 
mulreduce32B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1803 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1804   if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1805     int vector_len = Assembler::AVX_512bit;
1806     vpmovsxbw(vtmp1, src2, vector_len);
1807     reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1808   } else {
1809     assert(UseAVX >= 2,"Should not reach here.");
1810     mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1811     vextracti128_high(vtmp2, src2);
1812     mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1813   }
1814 }
1815 
mulreduce64B(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1816 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1817   mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1818   vextracti64x4_high(vtmp2, src2);
1819   mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1820 }
1821 
reduce4S(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1822 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1823   if (opcode == Op_AddReductionVI) {
1824     if (vtmp1 != src2) {
1825       movdqu(vtmp1, src2);
1826     }
1827     phaddw(vtmp1, vtmp1);
1828     phaddw(vtmp1, vtmp1);
1829   } else {
1830     pshufd(vtmp2, src2, 0x1);
1831     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1832     movdqu(vtmp1, vtmp2);
1833     psrldq(vtmp1, 2);
1834     reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1835   }
1836   movdl(vtmp2, src1);
1837   pmovsxwd(vtmp1, vtmp1);
1838   reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1839   pextrw(dst, vtmp1, 0x0);
1840   movswl(dst, dst);
1841 }
1842 
reduce8S(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1843 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1844   if (opcode == Op_AddReductionVI) {
1845     if (vtmp1 != src2) {
1846       movdqu(vtmp1, src2);
1847     }
1848     phaddw(vtmp1, src2);
1849   } else {
1850     pshufd(vtmp1, src2, 0xE);
1851     reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1852   }
1853   reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1854 }
1855 
reduce16S(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1856 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1857   if (opcode == Op_AddReductionVI) {
1858     int vector_len = Assembler::AVX_256bit;
1859     vphaddw(vtmp2, src2, src2, vector_len);
1860     vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1861   } else {
1862     vextracti128_high(vtmp2, src2);
1863     reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1864   }
1865   reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1866 }
1867 
reduce32S(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1868 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1869   int vector_len = Assembler::AVX_256bit;
1870   vextracti64x4_high(vtmp1, src2);
1871   reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1872   reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1873 }
1874 
1875 #ifdef _LP64
reduce2L(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1876 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1877   pshufd(vtmp2, src2, 0xE);
1878   reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1879   movdq(vtmp1, src1);
1880   reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1881   movdq(dst, vtmp1);
1882 }
1883 
reduce4L(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1884 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1885   vextracti128_high(vtmp1, src2);
1886   reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1887   reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1888 }
1889 
reduce8L(int opcode,Register dst,Register src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)1890 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1891   vextracti64x4_high(vtmp2, src2);
1892   reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1893   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1894 }
1895 
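// Build a 64-bit mask with the low 'len' bits set: either (1 << len) - 1, or an all-ones value shifted right by (64 - len).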
genmask(Register dst,Register len,Register temp)1896 void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
1897   if (ArrayCopyPartialInlineSize <= 32) {
1898     mov64(dst, 1);
1899     shlxq(dst, dst, len);
1900     decq(dst);
1901   } else {
1902     mov64(dst, -1);
1903     movq(temp, len);
1904     negptr(temp);
1905     addptr(temp, 64);
1906     shrxq(dst, dst, temp);
1907   }
1908 }
1909 #endif // _LP64
1910 
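// The FP reductions below accumulate into dst (expected to hold the scalar input on entry),
// combining it with each element of src in order.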
reduce2F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1911 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1912   reduce_operation_128(T_FLOAT, opcode, dst, src);
1913   pshufd(vtmp, src, 0x1);
1914   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1915 }
1916 
reduce4F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1917 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1918   reduce2F(opcode, dst, src, vtmp);
1919   pshufd(vtmp, src, 0x2);
1920   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1921   pshufd(vtmp, src, 0x3);
1922   reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1923 }
1924 
reduce8F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1925 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1926   reduce4F(opcode, dst, src, vtmp2);
1927   vextractf128_high(vtmp2, src);
1928   reduce4F(opcode, dst, vtmp2, vtmp1);
1929 }
1930 
reduce16F(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1931 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1932   reduce8F(opcode, dst, src, vtmp1, vtmp2);
1933   vextracti64x4_high(vtmp1, src);
1934   reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1935 }
1936 
reduce2D(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp)1937 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1938   reduce_operation_128(T_DOUBLE, opcode, dst, src);
1939   pshufd(vtmp, src, 0xE);
1940   reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1941 }
1942 
reduce4D(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1943 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1944   reduce2D(opcode, dst, src, vtmp2);
1945   vextractf128_high(vtmp2, src);
1946   reduce2D(opcode, dst, vtmp2, vtmp1);
1947 }
1948 
reduce8D(int opcode,XMMRegister dst,XMMRegister src,XMMRegister vtmp1,XMMRegister vtmp2)1949 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1950   reduce4D(opcode, dst, src, vtmp1, vtmp2);
1951   vextracti64x4_high(vtmp1, src);
1952   reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1953 }
1954 
evmovdqu(BasicType type,KRegister kmask,XMMRegister dst,Address src,int vector_len)1955 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1956   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1957 }
1958 
evmovdqu(BasicType type,KRegister kmask,Address dst,XMMRegister src,int vector_len)1959 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1960   MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1961 }
1962 
1963 
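// Min/max reduction over a float vector: repeatedly fold the upper half of the working source
// into a temporary (lane extract for the wide steps, in-lane permute for the last two steps)
// and combine via vminmax_fp; when is_dst_valid, the result is finally combined with dst.
// The double variant below follows the same scheme.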
reduceFloatMinMax(int opcode,int vlen,bool is_dst_valid,XMMRegister dst,XMMRegister src,XMMRegister tmp,XMMRegister atmp,XMMRegister btmp,XMMRegister xmm_0,XMMRegister xmm_1)1964 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1965                                           XMMRegister dst, XMMRegister src,
1966                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1967                                           XMMRegister xmm_0, XMMRegister xmm_1) {
1968   int permconst[] = {1, 14};
1969   XMMRegister wsrc = src;
1970   XMMRegister wdst = xmm_0;
1971   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1972 
1973   int vlen_enc = Assembler::AVX_128bit;
1974   if (vlen == 16) {
1975     vlen_enc = Assembler::AVX_256bit;
1976   }
1977 
1978   for (int i = log2(vlen) - 1; i >=0; i--) {
1979     if (i == 0 && !is_dst_valid) {
1980       wdst = dst;
1981     }
1982     if (i == 3) {
1983       vextracti64x4_high(wtmp, wsrc);
1984     } else if (i == 2) {
1985       vextracti128_high(wtmp, wsrc);
1986     } else { // i = [0,1]
1987       vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
1988     }
1989     vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
1990     wsrc = wdst;
1991     vlen_enc = Assembler::AVX_128bit;
1992   }
1993   if (is_dst_valid) {
1994     vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
1995   }
1996 }
1997 
reduceDoubleMinMax(int opcode,int vlen,bool is_dst_valid,XMMRegister dst,XMMRegister src,XMMRegister tmp,XMMRegister atmp,XMMRegister btmp,XMMRegister xmm_0,XMMRegister xmm_1)1998 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
1999                                         XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2000                                         XMMRegister xmm_0, XMMRegister xmm_1) {
2001   XMMRegister wsrc = src;
2002   XMMRegister wdst = xmm_0;
2003   XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2004   int vlen_enc = Assembler::AVX_128bit;
2005   if (vlen == 8) {
2006     vlen_enc = Assembler::AVX_256bit;
2007   }
2008   for (int i = log2(vlen) - 1; i >=0; i--) {
2009     if (i == 0 && !is_dst_valid) {
2010       wdst = dst;
2011     }
2012     if (i == 1) {
2013       vextracti128_high(wtmp, wsrc);
2014     } else if (i == 2) {
2015       vextracti64x4_high(wtmp, wsrc);
2016     } else {
2017       assert(i == 0, "%d", i);
2018       vpermilpd(wtmp, wsrc, 1, vlen_enc);
2019     }
2020     vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2021     wsrc = wdst;
2022     vlen_enc = Assembler::AVX_128bit;
2023   }
2024   if (is_dst_valid) {
2025     vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2026   }
2027 }
2028 
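// Extract element 'idx' of src into a general-purpose register, dispatched on the element type.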
extract(BasicType bt,Register dst,XMMRegister src,int idx)2029 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2030   switch (bt) {
2031     case T_BYTE:  pextrb(dst, src, idx); break;
2032     case T_SHORT: pextrw(dst, src, idx); break;
2033     case T_INT:   pextrd(dst, src, idx); break;
2034     case T_LONG:  pextrq(dst, src, idx); break;
2035 
2036     default:
2037       assert(false,"Should not reach here.");
2038       break;
2039   }
2040 }
2041 
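// Return the register holding the 128-bit lane that contains 'elemindex': src itself for lane 0,
// otherwise the lane extracted into dst.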
get_lane(BasicType typ,XMMRegister dst,XMMRegister src,int elemindex)2042 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2043   int esize =  type2aelembytes(typ);
2044   int elem_per_lane = 16/esize;
2045   int lane = elemindex / elem_per_lane;
2046   int eindex = elemindex % elem_per_lane;
2047 
2048   if (lane >= 2) {
2049     assert(UseAVX > 2, "required");
2050     vextractf32x4(dst, src, lane & 3);
2051     return dst;
2052   } else if (lane > 0) {
2053     assert(UseAVX > 0, "required");
2054     vextractf128(dst, src, lane);
2055     return dst;
2056   } else {
2057     return src;
2058   }
2059 }
2060 
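// Extract an integral element into a general-purpose register; the element-0 fast path reads
// the low bits directly and sign-extends sub-int types.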
get_elem(BasicType typ,Register dst,XMMRegister src,int elemindex)2061 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2062   int esize =  type2aelembytes(typ);
2063   int elem_per_lane = 16/esize;
2064   int eindex = elemindex % elem_per_lane;
2065   assert(is_integral_type(typ),"required");
2066 
2067   if (eindex == 0) {
2068     if (typ == T_LONG) {
2069       movq(dst, src);
2070     } else {
2071       movdl(dst, src);
2072       if (typ == T_BYTE)
2073         movsbl(dst, dst);
2074       else if (typ == T_SHORT)
2075         movswl(dst, dst);
2076     }
2077   } else {
2078     extract(typ, dst, src, eindex);
2079   }
2080 }
2081 
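// Extract a float or double element: move the requested element into the low bits of dst,
// then clear the bits above it.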
get_elem(BasicType typ,XMMRegister dst,XMMRegister src,int elemindex,Register tmp,XMMRegister vtmp)2082 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2083   int esize =  type2aelembytes(typ);
2084   int elem_per_lane = 16/esize;
2085   int eindex = elemindex % elem_per_lane;
2086   assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2087 
2088   if (eindex == 0) {
2089     movq(dst, src);
2090   } else {
2091     if (typ == T_FLOAT) {
2092       if (UseAVX == 0) {
2093         movdqu(dst, src);
2094         pshufps(dst, dst, eindex);
2095       } else {
2096         vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2097       }
2098     } else {
2099       if (UseAVX == 0) {
2100         movdqu(dst, src);
2101         psrldq(dst, eindex*esize);
2102       } else {
2103         vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2104       }
2105       movq(dst, dst);
2106     }
2107   }
2108   // Zero upper bits
2109   if (typ == T_FLOAT) {
2110     if (UseAVX == 0) {
2111       assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2112       movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2113       pand(dst, vtmp);
2114     } else {
2115       assert((tmp != noreg), "required.");
2116       vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2117     }
2118   }
2119 }
2120 
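// Type-dispatched AVX-512 compare of src1 against a memory operand; the predicate result is
// written to kdmask under source mask ksmask.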
evpcmp(BasicType typ,KRegister kdmask,KRegister ksmask,XMMRegister src1,AddressLiteral adr,int comparison,int vector_len,Register scratch)2121 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2122   switch(typ) {
2123     case T_BYTE:
2124       evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2125       break;
2126     case T_SHORT:
2127       evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2128       break;
2129     case T_INT:
2130     case T_FLOAT:
2131       evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2132       break;
2133     case T_LONG:
2134     case T_DOUBLE:
2135       evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
2136       break;
2137     default:
2138       assert(false,"Should not reach here.");
2139       break;
2140   }
2141 }
2142 
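// Type-dispatched AVX-512 masked blend of src1 and src2 into dst under kmask.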
evpblend(BasicType typ,XMMRegister dst,KRegister kmask,XMMRegister src1,XMMRegister src2,bool merge,int vector_len)2143 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2144   switch(typ) {
2145     case T_BYTE:
2146       evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2147       break;
2148     case T_SHORT:
2149       evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2150       break;
2151     case T_INT:
2152     case T_FLOAT:
2153       evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2154       break;
2155     case T_LONG:
2156     case T_DOUBLE:
2157       evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2158       break;
2159     default:
2160       assert(false,"Should not reach here.");
2161       break;
2162   }
2163 }
2164 
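// Emit a vector test whose condition flags the caller branches on: 4- and 8-byte vectors are
// broadcast to 128 bits before ptest, 16- and 32-byte vectors use ptest/vptest directly, and
// 64-byte vectors compare into a k register and use ktest/kortest.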
vectortest(int bt,int vlen,XMMRegister src1,XMMRegister src2,XMMRegister vtmp1,XMMRegister vtmp2)2165 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166   switch(vlen) {
2167     case 4:
2168       assert(vtmp1 != xnoreg, "required.");
2169       // Broadcast lower 32 bits to 128 bits before ptest
2170       pshufd(vtmp1, src1, 0x0);
2171       if (bt == BoolTest::overflow) {
2172         assert(vtmp2 != xnoreg, "required.");
2173         pshufd(vtmp2, src2, 0x0);
2174       } else {
2175         assert(vtmp2 == xnoreg, "required.");
2176         vtmp2 = src2;
2177       }
2178       ptest(vtmp1, vtmp2);
2179       break;
2180     case 8:
2181       assert(vtmp1 != xnoreg, "required.");
2182       // Broadcast lower 64 bits to 128 bits before ptest
2183       pshufd(vtmp1, src1, 0x4);
2184       if (bt == BoolTest::overflow) {
2185         assert(vtmp2 != xnoreg, "required.");
2186         pshufd(vtmp2, src2, 0x4);
2187       } else {
2188         assert(vtmp2 == xnoreg, "required.");
2189         vtmp2 = src2;
2190       }
2191       ptest(vtmp1, vtmp2);
2192       break;
2193     case 16:
2194       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2195       ptest(src1, src2);
2196       break;
2197     case 32:
2198       assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2199       vptest(src1, src2, Assembler::AVX_256bit);
2200       break;
2201     case 64:
2202       {
2203         KRegister ktemp = k2; // Use a hardcoded temp due to no k register allocation.
2204         assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2205         evpcmpeqb(ktemp, src1, src2, Assembler::AVX_512bit);
2206         if (bt == BoolTest::ne) {
2207           ktestql(ktemp, ktemp);
2208         } else {
2209           assert(bt == BoolTest::overflow, "required");
2210           kortestql(ktemp, ktemp);
2211         }
2212       }
2213       break;
2214     default:
2215       assert(false,"Should not reach here.");
2216       break;
2217   }
2218 }
2219 
2220 //-------------------------------------------------------------------------------------------
2221 
2222 // IndexOf for constant substrings with size >= 8 chars
2223 // which don't need to be loaded through the stack.
string_indexofC8(Register str1,Register str2,Register cnt1,Register cnt2,int int_cnt2,Register result,XMMRegister vec,Register tmp,int ae)2224 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2225                                          Register cnt1, Register cnt2,
2226                                          int int_cnt2,  Register result,
2227                                          XMMRegister vec, Register tmp,
2228                                          int ae) {
2229   ShortBranchVerifier sbv(this);
2230   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2231   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2232 
2233   // This method uses the pcmpestri instruction with bound registers
2234   //   inputs:
2235   //     xmm - substring
2236   //     rax - substring length (elements count)
2237   //     mem - scanned string
2238   //     rdx - string length (elements count)
2239   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2240   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2241   //   outputs:
2242   //     rcx - matched index in string
2243   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2244   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2245   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2246   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2247   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2248 
2249   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2250         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2251         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2252 
2253   // Note, inline_string_indexOf() generates checks:
2254   // if (substr.count > string.count) return -1;
2255   // if (substr.count == 0) return 0;
2256   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2257 
2258   // Load substring.
2259   if (ae == StrIntrinsicNode::UL) {
2260     pmovzxbw(vec, Address(str2, 0));
2261   } else {
2262     movdqu(vec, Address(str2, 0));
2263   }
2264   movl(cnt2, int_cnt2);
2265   movptr(result, str1); // string addr
2266 
2267   if (int_cnt2 > stride) {
2268     jmpb(SCAN_TO_SUBSTR);
2269 
2270     // Reload substr for rescan; this code
2271     // is executed only for large substrings (> 8 chars).
2272     bind(RELOAD_SUBSTR);
2273     if (ae == StrIntrinsicNode::UL) {
2274       pmovzxbw(vec, Address(str2, 0));
2275     } else {
2276       movdqu(vec, Address(str2, 0));
2277     }
2278     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2279 
2280     bind(RELOAD_STR);
2281     // We came here after the beginning of the substring was
2282     // matched but the rest of it was not, so we need to search
2283     // again. Start from the next element after the previous match.
2284 
2285     // cnt2 is the number of remaining substring elements and
2286     // cnt1 is the number of remaining string elements when the compare failed.
2287     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2288     subl(cnt1, cnt2);
2289     addl(cnt1, int_cnt2);
2290     movl(cnt2, int_cnt2); // Now restore cnt2
2291 
2292     decrementl(cnt1);     // Shift to next element
2293     cmpl(cnt1, cnt2);
2294     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2295 
2296     addptr(result, (1<<scale1));
2297 
2298   } // (int_cnt2 > 8)
2299 
2300   // Scan string for start of substr in 16-byte vectors
2301   bind(SCAN_TO_SUBSTR);
2302   pcmpestri(vec, Address(result, 0), mode);
2303   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2304   subl(cnt1, stride);
2305   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2306   cmpl(cnt1, cnt2);
2307   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2308   addptr(result, 16);
2309   jmpb(SCAN_TO_SUBSTR);
2310 
2311   // Found a potential substr
2312   bind(FOUND_CANDIDATE);
2313   // Matched whole vector if first element matched (tmp(rcx) == 0).
2314   if (int_cnt2 == stride) {
2315     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
2316   } else { // int_cnt2 > 8
2317     jccb(Assembler::overflow, FOUND_SUBSTR);
2318   }
2319   // After pcmpestri tmp(rcx) contains matched element index
2320   // Compute start addr of substr
2321   lea(result, Address(result, tmp, scale1));
2322 
2323   // Make sure string is still long enough
2324   subl(cnt1, tmp);
2325   cmpl(cnt1, cnt2);
2326   if (int_cnt2 == stride) {
2327     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2328   } else { // int_cnt2 > 8
2329     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2330   }
2331   // Fewer elements left than the substring.
2332 
2333   bind(RET_NOT_FOUND);
2334   movl(result, -1);
2335   jmp(EXIT);
2336 
2337   if (int_cnt2 > stride) {
2338     // This code is optimized for the case when whole substring
2339     // is matched if its head is matched.
2340     bind(MATCH_SUBSTR_HEAD);
2341     pcmpestri(vec, Address(result, 0), mode);
2342     // Reload only the string if it does not match
2343     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2344 
2345     Label CONT_SCAN_SUBSTR;
2346     // Compare the rest of substring (> 8 chars).
2347     bind(FOUND_SUBSTR);
2348     // First 8 chars are already matched.
2349     negptr(cnt2);
2350     addptr(cnt2, stride);
2351 
2352     bind(SCAN_SUBSTR);
2353     subl(cnt1, stride);
2354     cmpl(cnt2, -stride); // Do not read beyond substring
2355     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2356     // Back up strings to avoid reading beyond substring:
2357     // cnt1 = cnt1 - cnt2 + 8
2358     addl(cnt1, cnt2); // cnt2 is negative
2359     addl(cnt1, stride);
2360     movl(cnt2, stride); negptr(cnt2);
2361     bind(CONT_SCAN_SUBSTR);
2362     if (int_cnt2 < (int)G) {
2363       int tail_off1 = int_cnt2<<scale1;
2364       int tail_off2 = int_cnt2<<scale2;
2365       if (ae == StrIntrinsicNode::UL) {
2366         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2367       } else {
2368         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2369       }
2370       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2371     } else {
2372       // calculate index in register to avoid integer overflow (int_cnt2*2)
2373       movl(tmp, int_cnt2);
2374       addptr(tmp, cnt2);
2375       if (ae == StrIntrinsicNode::UL) {
2376         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2377       } else {
2378         movdqu(vec, Address(str2, tmp, scale2, 0));
2379       }
2380       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2381     }
2382     // Need to reload string pointers if the whole vector did not match
2383     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2384     addptr(cnt2, stride);
2385     jcc(Assembler::negative, SCAN_SUBSTR);
2386     // Fall through if found full substring
2387 
2388   } // (int_cnt2 > 8)
2389 
2390   bind(RET_FOUND);
2391   // Found result if we matched full small substring.
2392   // Compute substr offset
2393   subptr(result, str1);
2394   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2395     shrl(result, 1); // index
2396   }
2397   bind(EXIT);
2398 
2399 } // string_indexofC8
2400 
2401 // Small strings are loaded through the stack if they cross a page boundary.
string_indexof(Register str1,Register str2,Register cnt1,Register cnt2,int int_cnt2,Register result,XMMRegister vec,Register tmp,int ae)2402 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2403                                        Register cnt1, Register cnt2,
2404                                        int int_cnt2,  Register result,
2405                                        XMMRegister vec, Register tmp,
2406                                        int ae) {
2407   ShortBranchVerifier sbv(this);
2408   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2409   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2410 
2411   //
2412   // int_cnt2 is the length of a small (< 8 chars) constant substring
2413   // or (-1) for a non-constant substring, in which case its length
2414   // is in the cnt2 register.
2415   //
2416   // Note, inline_string_indexOf() generates checks:
2417   // if (substr.count > string.count) return -1;
2418   // if (substr.count == 0) return 0;
2419   //
2420   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2421   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2422   // This method uses the pcmpestri instruction with bound registers
2423   //   inputs:
2424   //     xmm - substring
2425   //     rax - substring length (elements count)
2426   //     mem - scanned string
2427   //     rdx - string length (elements count)
2428   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2429   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2430   //   outputs:
2431   //     rcx - matched index in string
2432   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2433   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2434   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2435   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2436 
2437   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2438         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2439         FOUND_CANDIDATE;
2440 
2441   { //========================================================
2442     // We don't know where these strings are located
2443     // and we can't read beyond them. Load them through the stack.
2444     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2445 
2446     movptr(tmp, rsp); // save old SP
2447 
2448     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
2449       if (int_cnt2 == (1>>scale2)) { // One byte
2450         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2451         load_unsigned_byte(result, Address(str2, 0));
2452         movdl(vec, result); // move 32 bits
2453       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
2454         // Not enough header space in 32-bit VM: 12+3 = 15.
2455         movl(result, Address(str2, -1));
2456         shrl(result, 8);
2457         movdl(vec, result); // move 32 bits
2458       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
2459         load_unsigned_short(result, Address(str2, 0));
2460         movdl(vec, result); // move 32 bits
2461       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2462         movdl(vec, Address(str2, 0)); // move 32 bits
2463       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2464         movq(vec, Address(str2, 0));  // move 64 bits
2465       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2466         // Array header size is 12 bytes in 32-bit VM
2467         // + 6 bytes for 3 chars == 18 bytes,
2468         // enough space to load vec and shift.
2469         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2470         if (ae == StrIntrinsicNode::UL) {
2471           int tail_off = int_cnt2-8;
2472           pmovzxbw(vec, Address(str2, tail_off));
2473           psrldq(vec, -2*tail_off);
2474         }
2475         else {
2476           int tail_off = int_cnt2*(1<<scale2);
2477           movdqu(vec, Address(str2, tail_off-16));
2478           psrldq(vec, 16-tail_off);
2479         }
2480       }
2481     } else { // not constant substring
2482       cmpl(cnt2, stride);
2483       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2484 
2485       // We can read beyond the string if str+16 does not cross a page boundary
2486       // since heaps are aligned and mapped by pages.
2487       assert(os::vm_page_size() < (int)G, "default page should be small");
2488       movl(result, str2); // We need only low 32 bits
2489       andl(result, (os::vm_page_size()-1));
2490       cmpl(result, (os::vm_page_size()-16));
2491       jccb(Assembler::belowEqual, CHECK_STR);
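      // The check above is roughly (illustrative C only):
      //   if (((uintptr_t)str2 & (page_size - 1)) <= page_size - 16) {
      //     // a 16-byte load at str2 stays within the current page, so reading
      //     // a few bytes past the end of the string is safe
      //   }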
2492 
2493       // Move small strings to the stack to allow loading 16 bytes into vec.
2494       subptr(rsp, 16);
2495       int stk_offset = wordSize-(1<<scale2);
2496       push(cnt2);
2497 
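      // Note on the addressing below: stk_offset compensates for the push(cnt2)
      // above, so rsp + cnt2*elem_size + stk_offset addresses element (cnt2 - 1)
      // of the fresh 16-byte stack buffer while cnt2 counts down.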
2498       bind(COPY_SUBSTR);
2499       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2500         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2501         movb(Address(rsp, cnt2, scale2, stk_offset), result);
2502       } else if (ae == StrIntrinsicNode::UU) {
2503         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2504         movw(Address(rsp, cnt2, scale2, stk_offset), result);
2505       }
2506       decrement(cnt2);
2507       jccb(Assembler::notZero, COPY_SUBSTR);
2508 
2509       pop(cnt2);
2510       movptr(str2, rsp);  // New substring address
2511     } // non constant
2512 
2513     bind(CHECK_STR);
2514     cmpl(cnt1, stride);
2515     jccb(Assembler::aboveEqual, BIG_STRINGS);
2516 
2517     // Check cross page boundary.
2518     movl(result, str1); // We need only low 32 bits
2519     andl(result, (os::vm_page_size()-1));
2520     cmpl(result, (os::vm_page_size()-16));
2521     jccb(Assembler::belowEqual, BIG_STRINGS);
2522 
2523     subptr(rsp, 16);
2524     int stk_offset = -(1<<scale1);
2525     if (int_cnt2 < 0) { // not constant
2526       push(cnt2);
2527       stk_offset += wordSize;
2528     }
2529     movl(cnt2, cnt1);
2530 
2531     bind(COPY_STR);
2532     if (ae == StrIntrinsicNode::LL) {
2533       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2534       movb(Address(rsp, cnt2, scale1, stk_offset), result);
2535     } else {
2536       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2537       movw(Address(rsp, cnt2, scale1, stk_offset), result);
2538     }
2539     decrement(cnt2);
2540     jccb(Assembler::notZero, COPY_STR);
2541 
2542     if (int_cnt2 < 0) { // not constant
2543       pop(cnt2);
2544     }
2545     movptr(str1, rsp);  // New string address
2546 
2547     bind(BIG_STRINGS);
2548     // Load substring.
2549     if (int_cnt2 < 0) { // -1
2550       if (ae == StrIntrinsicNode::UL) {
2551         pmovzxbw(vec, Address(str2, 0));
2552       } else {
2553         movdqu(vec, Address(str2, 0));
2554       }
2555       push(cnt2);       // substr count
2556       push(str2);       // substr addr
2557       push(str1);       // string addr
2558     } else {
2559       // Small (< 8 chars) constant substrings are loaded already.
2560       movl(cnt2, int_cnt2);
2561     }
2562     push(tmp);  // original SP
2563 
2564   } // Finished loading
2565 
2566   //========================================================
2567   // Start search
2568   //
2569 
2570   movptr(result, str1); // string addr
2571 
2572   if (int_cnt2 < 0) {  // Only for non-constant substring
2573     jmpb(SCAN_TO_SUBSTR);
2574 
2575     // SP saved at sp+0
2576     // String saved at sp+1*wordSize
2577     // Substr saved at sp+2*wordSize
2578     // Substr count saved at sp+3*wordSize
2579 
2580     // Reload substr for rescan; this code
2581     // is executed only for large substrings (> 8 chars).
2582     bind(RELOAD_SUBSTR);
2583     movptr(str2, Address(rsp, 2*wordSize));
2584     movl(cnt2, Address(rsp, 3*wordSize));
2585     if (ae == StrIntrinsicNode::UL) {
2586       pmovzxbw(vec, Address(str2, 0));
2587     } else {
2588       movdqu(vec, Address(str2, 0));
2589     }
2590     // We came here after the beginning of the substring was
2591     // matched but the rest of it was not so we need to search
2592     // again. Start from the next element after the previous match.
2593     subptr(str1, result); // Restore counter
2594     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2595       shrl(str1, 1);
2596     }
2597     addl(cnt1, str1);
2598     decrementl(cnt1);   // Shift to next element
2599     cmpl(cnt1, cnt2);
2600     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2601 
2602     addptr(result, (1<<scale1));
2603   } // non constant
2604 
2605   // Scan string for start of substr in 16-byte vectors
2606   bind(SCAN_TO_SUBSTR);
2607   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2608   pcmpestri(vec, Address(result, 0), mode);
2609   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
2610   subl(cnt1, stride);
2611   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2612   cmpl(cnt1, cnt2);
2613     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
2614   addptr(result, 16);
2615 
2616   bind(ADJUST_STR);
2617   cmpl(cnt1, stride); // Do not read beyond string
2618   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2619   // Back-up string to avoid reading beyond string.
2620   lea(result, Address(result, cnt1, scale1, -16));
2621   movl(cnt1, stride);
2622   jmpb(SCAN_TO_SUBSTR);
2623 
2624   // Found a potential substr
2625   bind(FOUND_CANDIDATE);
2626   // After pcmpestri tmp(rcx) contains matched element index
2627 
2628   // Make sure string is still long enough
2629   subl(cnt1, tmp);
2630   cmpl(cnt1, cnt2);
2631   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2632   // Left less than substring.
2633 
2634   bind(RET_NOT_FOUND);
2635   movl(result, -1);
2636   jmp(CLEANUP);
2637 
2638   bind(FOUND_SUBSTR);
2639   // Compute start addr of substr
2640   lea(result, Address(result, tmp, scale1));
2641   if (int_cnt2 > 0) { // Constant substring
2642     // Repeat search for small substring (< 8 chars)
2643     // from new point without reloading substring.
2644     // Have to check that we don't read beyond string.
2645     cmpl(tmp, stride-int_cnt2);
2646     jccb(Assembler::greater, ADJUST_STR);
2647     // Fall through if matched whole substring.
2648   } else { // non constant
2649     assert(int_cnt2 == -1, "should be != 0");
2650 
2651     addl(tmp, cnt2);
2652     // Found result if we matched whole substring.
2653     cmpl(tmp, stride);
2654     jcc(Assembler::lessEqual, RET_FOUND);
2655 
2656     // Repeat search for small substring (<= 8 chars)
2657     // from new point 'str1' without reloading substring.
2658     cmpl(cnt2, stride);
2659     // Have to check that we don't read beyond string.
2660     jccb(Assembler::lessEqual, ADJUST_STR);
2661 
2662     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2663     // Compare the rest of substring (> 8 chars).
2664     movptr(str1, result);
2665 
2666     cmpl(tmp, cnt2);
2667     // First 8 chars are already matched.
2668     jccb(Assembler::equal, CHECK_NEXT);
2669 
2670     bind(SCAN_SUBSTR);
2671     pcmpestri(vec, Address(str1, 0), mode);
2672     // Need to reload string pointers if we did not match the whole vector
2673     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2674 
2675     bind(CHECK_NEXT);
2676     subl(cnt2, stride);
2677     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2678     addptr(str1, 16);
2679     if (ae == StrIntrinsicNode::UL) {
2680       addptr(str2, 8);
2681     } else {
2682       addptr(str2, 16);
2683     }
2684     subl(cnt1, stride);
2685     cmpl(cnt2, stride); // Do not read beyond substring
2686     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2687     // Back-up strings to avoid reading beyond substring.
2688 
2689     if (ae == StrIntrinsicNode::UL) {
2690       lea(str2, Address(str2, cnt2, scale2, -8));
2691       lea(str1, Address(str1, cnt2, scale1, -16));
2692     } else {
2693       lea(str2, Address(str2, cnt2, scale2, -16));
2694       lea(str1, Address(str1, cnt2, scale1, -16));
2695     }
2696     subl(cnt1, cnt2);
2697     movl(cnt2, stride);
2698     addl(cnt1, stride);
2699     bind(CONT_SCAN_SUBSTR);
2700     if (ae == StrIntrinsicNode::UL) {
2701       pmovzxbw(vec, Address(str2, 0));
2702     } else {
2703       movdqu(vec, Address(str2, 0));
2704     }
2705     jmp(SCAN_SUBSTR);
2706 
2707     bind(RET_FOUND_LONG);
2708     movptr(str1, Address(rsp, wordSize));
2709   } // non constant
2710 
2711   bind(RET_FOUND);
2712   // Compute substr offset
2713   subptr(result, str1);
2714   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2715     shrl(result, 1); // index
2716   }
2717   bind(CLEANUP);
2718   pop(rsp); // restore SP
2719 
2720 } // string_indexof
2721 
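// As a rough scalar reference for the UTF-16 char search below (illustrative
// only; the Java-side caller is roughly StringUTF16.indexOfChar):
//   static int indexof_char_ref(const jchar* s, int cnt, jchar ch) {
//     for (int i = 0; i < cnt; i++) {
//       if (s[i] == ch) return i;   // index of first occurrence
//     }
//     return -1;                    // not found
//   }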
2722 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2723                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2724   ShortBranchVerifier sbv(this);
2725   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2726 
2727   int stride = 8;
2728 
2729   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2730         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2731         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2732         FOUND_SEQ_CHAR, DONE_LABEL;
2733 
2734   movptr(result, str1);
2735   if (UseAVX >= 2) {
2736     cmpl(cnt1, stride);
2737     jcc(Assembler::less, SCAN_TO_CHAR);
2738     cmpl(cnt1, 2*stride);
2739     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2740     movdl(vec1, ch);
2741     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2742     vpxor(vec2, vec2);
2743     movl(tmp, cnt1);
2744     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
2745     andl(cnt1,0x0000000F);  //tail count (in chars)
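    // e.g. cnt1 == 37: tmp == 32 chars are handled by the 16-char (32-byte)
    // AVX2 loop below and cnt1 == 5 chars are left for the 8-char/scalar tails.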
2746 
2747     bind(SCAN_TO_16_CHAR_LOOP);
2748     vmovdqu(vec3, Address(result, 0));
2749     vpcmpeqw(vec3, vec3, vec1, 1);
2750     vptest(vec2, vec3);
2751     jcc(Assembler::carryClear, FOUND_CHAR);
2752     addptr(result, 32);
2753     subl(tmp, 2*stride);
2754     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2755     jmp(SCAN_TO_8_CHAR);
2756     bind(SCAN_TO_8_CHAR_INIT);
2757     movdl(vec1, ch);
2758     pshuflw(vec1, vec1, 0x00);
2759     pshufd(vec1, vec1, 0);
2760     pxor(vec2, vec2);
2761   }
2762   bind(SCAN_TO_8_CHAR);
2763   cmpl(cnt1, stride);
2764   jcc(Assembler::less, SCAN_TO_CHAR);
2765   if (UseAVX < 2) {
2766     movdl(vec1, ch);
2767     pshuflw(vec1, vec1, 0x00);
2768     pshufd(vec1, vec1, 0);
2769     pxor(vec2, vec2);
2770   }
2771   movl(tmp, cnt1);
2772   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
2773   andl(cnt1,0x00000007);  //tail count (in chars)
2774 
2775   bind(SCAN_TO_8_CHAR_LOOP);
2776   movdqu(vec3, Address(result, 0));
2777   pcmpeqw(vec3, vec1);
2778   ptest(vec2, vec3);
2779   jcc(Assembler::carryClear, FOUND_CHAR);
2780   addptr(result, 16);
2781   subl(tmp, stride);
2782   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2783   bind(SCAN_TO_CHAR);
2784   testl(cnt1, cnt1);
2785   jcc(Assembler::zero, RET_NOT_FOUND);
2786   bind(SCAN_TO_CHAR_LOOP);
2787   load_unsigned_short(tmp, Address(result, 0));
2788   cmpl(ch, tmp);
2789   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2790   addptr(result, 2);
2791   subl(cnt1, 1);
2792   jccb(Assembler::zero, RET_NOT_FOUND);
2793   jmp(SCAN_TO_CHAR_LOOP);
2794 
2795   bind(RET_NOT_FOUND);
2796   movl(result, -1);
2797   jmpb(DONE_LABEL);
2798 
2799   bind(FOUND_CHAR);
2800   if (UseAVX >= 2) {
2801     vpmovmskb(tmp, vec3);
2802   } else {
2803     pmovmskb(tmp, vec3);
2804   }
2805   bsfl(ch, tmp);
2806   addptr(result, ch);
2807 
2808   bind(FOUND_SEQ_CHAR);
2809   subptr(result, str1);
2810   shrl(result, 1);
2811 
2812   bind(DONE_LABEL);
2813 } // string_indexof_char
2814 
2815 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2816                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2817   ShortBranchVerifier sbv(this);
2818   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2819 
2820   int stride = 16;
2821 
2822   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2823         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2824         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2825         FOUND_SEQ_CHAR, DONE_LABEL;
2826 
2827   movptr(result, str1);
2828   if (UseAVX >= 2) {
2829     cmpl(cnt1, stride);
2830     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2831     cmpl(cnt1, stride*2);
2832     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2833     movdl(vec1, ch);
2834     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2835     vpxor(vec2, vec2);
2836     movl(tmp, cnt1);
2837     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
2838     andl(cnt1,0x0000001F);  //tail count (in chars)
2839 
2840     bind(SCAN_TO_32_CHAR_LOOP);
2841     vmovdqu(vec3, Address(result, 0));
2842     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2843     vptest(vec2, vec3);
2844     jcc(Assembler::carryClear, FOUND_CHAR);
2845     addptr(result, 32);
2846     subl(tmp, stride*2);
2847     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2848     jmp(SCAN_TO_16_CHAR);
2849 
2850     bind(SCAN_TO_16_CHAR_INIT);
2851     movdl(vec1, ch);
2852     pxor(vec2, vec2);
2853     pshufb(vec1, vec2);
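    // pshufb with an all-zero shuffle mask (vec2) replicates byte 0 of vec1
    // into every lane, i.e. broadcasts the search byte (SSE counterpart of the
    // vpbroadcastb used above).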
2854   }
2855 
2856   bind(SCAN_TO_16_CHAR);
2857   cmpl(cnt1, stride);
2858   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2859   if (UseAVX < 2) {
2860     movdl(vec1, ch);
2861     pxor(vec2, vec2);
2862     pshufb(vec1, vec2);
2863   }
2864   movl(tmp, cnt1);
2865   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
2866   andl(cnt1,0x0000000F);  //tail count (in bytes)
2867 
2868   bind(SCAN_TO_16_CHAR_LOOP);
2869   movdqu(vec3, Address(result, 0));
2870   pcmpeqb(vec3, vec1);
2871   ptest(vec2, vec3);
2872   jcc(Assembler::carryClear, FOUND_CHAR);
2873   addptr(result, 16);
2874   subl(tmp, stride);
2875   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2876 
2877   bind(SCAN_TO_CHAR_INIT);
2878   testl(cnt1, cnt1);
2879   jcc(Assembler::zero, RET_NOT_FOUND);
2880   bind(SCAN_TO_CHAR_LOOP);
2881   load_unsigned_byte(tmp, Address(result, 0));
2882   cmpl(ch, tmp);
2883   jccb(Assembler::equal, FOUND_SEQ_CHAR);
2884   addptr(result, 1);
2885   subl(cnt1, 1);
2886   jccb(Assembler::zero, RET_NOT_FOUND);
2887   jmp(SCAN_TO_CHAR_LOOP);
2888 
2889   bind(RET_NOT_FOUND);
2890   movl(result, -1);
2891   jmpb(DONE_LABEL);
2892 
2893   bind(FOUND_CHAR);
2894   if (UseAVX >= 2) {
2895     vpmovmskb(tmp, vec3);
2896   } else {
2897     pmovmskb(tmp, vec3);
2898   }
2899   bsfl(ch, tmp);
2900   addptr(result, ch);
2901 
2902   bind(FOUND_SEQ_CHAR);
2903   subptr(result, str1);
2904 
2905   bind(DONE_LABEL);
2906 } // stringL_indexof_char
2907 
2908 // helper function for string_compare
2909 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
2910                                            Address::ScaleFactor scale, Address::ScaleFactor scale1,
2911                                            Address::ScaleFactor scale2, Register index, int ae) {
2912   if (ae == StrIntrinsicNode::LL) {
2913     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
2914     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
2915   } else if (ae == StrIntrinsicNode::UU) {
2916     load_unsigned_short(elem1, Address(str1, index, scale, 0));
2917     load_unsigned_short(elem2, Address(str2, index, scale, 0));
2918   } else {
2919     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
2920     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
2921   }
2922 }
2923 
2924 // Compare strings, used for char[] and byte[].
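// As a rough scalar reference for the same-encoding case (illustrative only;
// the Java-side semantics are those of String.compareTo):
//   static int compare_ref(const jchar* s1, int len1, const jchar* s2, int len2) {
//     int min = len1 < len2 ? len1 : len2;
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];  // first mismatching element
//     }
//     return len1 - len2;                          // equal prefix: shorter sorts first
//   }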
2925 void C2_MacroAssembler::string_compare(Register str1, Register str2,
2926                                        Register cnt1, Register cnt2, Register result,
2927                                        XMMRegister vec1, int ae) {
2928   ShortBranchVerifier sbv(this);
2929   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
2930   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
2931   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
2932   int stride2x2 = 0x40;
2933   Address::ScaleFactor scale = Address::no_scale;
2934   Address::ScaleFactor scale1 = Address::no_scale;
2935   Address::ScaleFactor scale2 = Address::no_scale;
2936 
2937   if (ae != StrIntrinsicNode::LL) {
2938     stride2x2 = 0x20;
2939   }
2940 
2941   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
2942     shrl(cnt2, 1);
2943   }
2944   // Compute the minimum of the string lengths, and push the
2945   // difference of the string lengths onto the stack.
2946   // Select the minimum with a conditional move.
2947   movl(result, cnt1);
2948   subl(cnt1, cnt2);
2949   push(cnt1);
2950   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
2951 
2952   // Is the minimum length zero?
2953   testl(cnt2, cnt2);
2954   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
2955   if (ae == StrIntrinsicNode::LL) {
2956     // Load first bytes
2957     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
2958     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
2959   } else if (ae == StrIntrinsicNode::UU) {
2960     // Load first characters
2961     load_unsigned_short(result, Address(str1, 0));
2962     load_unsigned_short(cnt1, Address(str2, 0));
2963   } else {
2964     load_unsigned_byte(result, Address(str1, 0));
2965     load_unsigned_short(cnt1, Address(str2, 0));
2966   }
2967   subl(result, cnt1);
2968   jcc(Assembler::notZero,  POP_LABEL);
2969 
2970   if (ae == StrIntrinsicNode::UU) {
2971     // Divide length by 2 to get number of chars
2972     shrl(cnt2, 1);
2973   }
2974   cmpl(cnt2, 1);
2975   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2976 
2977   // Check if the strings start at the same location and setup scale and stride
2978   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
2979     cmpptr(str1, str2);
2980     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
2981     if (ae == StrIntrinsicNode::LL) {
2982       scale = Address::times_1;
2983       stride = 16;
2984     } else {
2985       scale = Address::times_2;
2986       stride = 8;
2987     }
2988   } else {
2989     scale1 = Address::times_1;
2990     scale2 = Address::times_2;
2991     // scale not used
2992     stride = 8;
2993   }
2994 
2995   if (UseAVX >= 2 && UseSSE42Intrinsics) {
2996     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
2997     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
2998     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
2999     Label COMPARE_TAIL_LONG;
3000     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
3001 
3002     int pcmpmask = 0x19;
3003     if (ae == StrIntrinsicNode::LL) {
3004       pcmpmask &= ~0x01;
3005     }
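    // pcmpmask 0x19 decomposes as: bits[1:0] = 01 unsigned words (cleared to 00,
    // i.e. unsigned bytes, for LL above), bits[3:2] = 10 "equal each" (element-wise
    // compare), bits[5:4] = 01 negated result, bit[6] = 0 least significant
    // differing index.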
3006 
3007     // Setup to compare 16-chars (32-bytes) vectors,
3008     // start from the first character again because it has an aligned address.
3009     if (ae == StrIntrinsicNode::LL) {
3010       stride2 = 32;
3011     } else {
3012       stride2 = 16;
3013     }
3014     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3015       adr_stride = stride << scale;
3016     } else {
3017       adr_stride1 = 8;  //stride << scale1;
3018       adr_stride2 = 16; //stride << scale2;
3019     }
3020 
3021     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3022     // rax and rdx are used by pcmpestri as element counters
3023     movl(result, cnt2);
3024     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
3025     jcc(Assembler::zero, COMPARE_TAIL_LONG);
3026 
3027     // fast path : compare first 2 8-char vectors.
3028     bind(COMPARE_16_CHARS);
3029     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3030       movdqu(vec1, Address(str1, 0));
3031     } else {
3032       pmovzxbw(vec1, Address(str1, 0));
3033     }
3034     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3035     jccb(Assembler::below, COMPARE_INDEX_CHAR);
3036 
3037     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3038       movdqu(vec1, Address(str1, adr_stride));
3039       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3040     } else {
3041       pmovzxbw(vec1, Address(str1, adr_stride1));
3042       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3043     }
3044     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3045     addl(cnt1, stride);
3046 
3047     // Compare the characters at index in cnt1
3048     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3049     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3050     subl(result, cnt2);
3051     jmp(POP_LABEL);
3052 
3053     // Setup the registers to start vector comparison loop
3054     bind(COMPARE_WIDE_VECTORS);
3055     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3056       lea(str1, Address(str1, result, scale));
3057       lea(str2, Address(str2, result, scale));
3058     } else {
3059       lea(str1, Address(str1, result, scale1));
3060       lea(str2, Address(str2, result, scale2));
3061     }
3062     subl(result, stride2);
3063     subl(cnt2, stride2);
3064     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3065     negptr(result);
3066 
3067     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3068     bind(COMPARE_WIDE_VECTORS_LOOP);
3069 
3070 #ifdef _LP64
3071     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3072       cmpl(cnt2, stride2x2);
3073       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3074       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
3075       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
3076 
3077       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3078       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3079         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3080         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3081       } else {
3082         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3083         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3084       }
3085       kortestql(k7, k7);
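      // kortestql sets CF only when k7 is all ones (all 64 bytes matched);
      // aboveEqual tests CF == 0, i.e. at least one byte differed.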
3086       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
3087       addptr(result, stride2x2);  // update since we already compared at this addr
3088       subl(cnt2, stride2x2);      // and sub the size too
3089       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3090 
3091       vpxor(vec1, vec1);
3092       jmpb(COMPARE_WIDE_TAIL);
3093     }//if (VM_Version::supports_avx512vlbw())
3094 #endif // _LP64
3095 
3096 
3097     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3098     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3099       vmovdqu(vec1, Address(str1, result, scale));
3100       vpxor(vec1, Address(str2, result, scale));
3101     } else {
3102       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3103       vpxor(vec1, Address(str2, result, scale2));
3104     }
3105     vptest(vec1, vec1);
3106     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3107     addptr(result, stride2);
3108     subl(cnt2, stride2);
3109     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3110     // clean upper bits of YMM registers
3111     vpxor(vec1, vec1);
3112 
3113     // compare wide vectors tail
3114     bind(COMPARE_WIDE_TAIL);
3115     testptr(result, result);
3116     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3117 
3118     movl(result, stride2);
3119     movl(cnt2, result);
3120     negptr(result);
3121     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3122 
3123     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3124     bind(VECTOR_NOT_EQUAL);
3125     // clean upper bits of YMM registers
3126     vpxor(vec1, vec1);
3127     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3128       lea(str1, Address(str1, result, scale));
3129       lea(str2, Address(str2, result, scale));
3130     } else {
3131       lea(str1, Address(str1, result, scale1));
3132       lea(str2, Address(str2, result, scale2));
3133     }
3134     jmp(COMPARE_16_CHARS);
3135 
3136     // Compare tail chars, length between 1 and 15 chars
3137     bind(COMPARE_TAIL_LONG);
3138     movl(cnt2, result);
3139     cmpl(cnt2, stride);
3140     jcc(Assembler::less, COMPARE_SMALL_STR);
3141 
3142     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3143       movdqu(vec1, Address(str1, 0));
3144     } else {
3145       pmovzxbw(vec1, Address(str1, 0));
3146     }
3147     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3148     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3149     subptr(cnt2, stride);
3150     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3151     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3152       lea(str1, Address(str1, result, scale));
3153       lea(str2, Address(str2, result, scale));
3154     } else {
3155       lea(str1, Address(str1, result, scale1));
3156       lea(str2, Address(str2, result, scale2));
3157     }
3158     negptr(cnt2);
3159     jmpb(WHILE_HEAD_LABEL);
3160 
3161     bind(COMPARE_SMALL_STR);
3162   } else if (UseSSE42Intrinsics) {
3163     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3164     int pcmpmask = 0x19;
3165     // Setup to compare 8-char (16-byte) vectors,
3166     // start from the first character again because it has an aligned address.
3167     movl(result, cnt2);
3168     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
3169     if (ae == StrIntrinsicNode::LL) {
3170       pcmpmask &= ~0x01;
3171     }
3172     jcc(Assembler::zero, COMPARE_TAIL);
3173     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3174       lea(str1, Address(str1, result, scale));
3175       lea(str2, Address(str2, result, scale));
3176     } else {
3177       lea(str1, Address(str1, result, scale1));
3178       lea(str2, Address(str2, result, scale2));
3179     }
3180     negptr(result);
3181 
3182     // pcmpestri
3183     //   inputs:
3184     //     vec1- substring
3185     //     rax - negative string length (elements count)
3186     //     mem - scanned string
3187     //     rdx - string length (elements count)
3188     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
3189     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
3190     //   outputs:
3191     //     rcx - first mismatched element index
3192     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3193 
3194     bind(COMPARE_WIDE_VECTORS);
3195     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3196       movdqu(vec1, Address(str1, result, scale));
3197       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3198     } else {
3199       pmovzxbw(vec1, Address(str1, result, scale1));
3200       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3201     }
3202     // After pcmpestri cnt1(rcx) contains mismatched element index
3203 
3204     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
3205     addptr(result, stride);
3206     subptr(cnt2, stride);
3207     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3208 
3209     // compare wide vectors tail
3210     testptr(result, result);
3211     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3212 
3213     movl(cnt2, stride);
3214     movl(result, stride);
3215     negptr(result);
3216     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3217       movdqu(vec1, Address(str1, result, scale));
3218       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3219     } else {
3220       pmovzxbw(vec1, Address(str1, result, scale1));
3221       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3222     }
3223     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3224 
3225     // Mismatched characters in the vectors
3226     bind(VECTOR_NOT_EQUAL);
3227     addptr(cnt1, result);
3228     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3229     subl(result, cnt2);
3230     jmpb(POP_LABEL);
3231 
3232     bind(COMPARE_TAIL); // limit is zero
3233     movl(cnt2, result);
3234     // Fallthru to tail compare
3235   }
3236   // Shift str2 and str1 to the end of the arrays, negate min
3237   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3238     lea(str1, Address(str1, cnt2, scale));
3239     lea(str2, Address(str2, cnt2, scale));
3240   } else {
3241     lea(str1, Address(str1, cnt2, scale1));
3242     lea(str2, Address(str2, cnt2, scale2));
3243   }
3244   decrementl(cnt2);  // first character was compared already
3245   negptr(cnt2);
3246 
3247   // Compare the rest of the elements
3248   bind(WHILE_HEAD_LABEL);
3249   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3250   subl(result, cnt1);
3251   jccb(Assembler::notZero, POP_LABEL);
3252   increment(cnt2);
3253   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3254 
3255   // Strings are equal up to min length.  Return the length difference.
3256   bind(LENGTH_DIFF_LABEL);
3257   pop(result);
3258   if (ae == StrIntrinsicNode::UU) {
3259     // Divide diff by 2 to get number of chars
3260     sarl(result, 1);
3261   }
3262   jmpb(DONE_LABEL);
3263 
3264 #ifdef _LP64
3265   if (VM_Version::supports_avx512vlbw()) {
3266 
3267     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3268 
3269     kmovql(cnt1, k7);
3270     notq(cnt1);
3271     bsfq(cnt2, cnt1);
3272     if (ae != StrIntrinsicNode::LL) {
3273       // Divide diff by 2 to get number of chars
3274       sarl(cnt2, 1);
3275     }
3276     addq(result, cnt2);
3277     if (ae == StrIntrinsicNode::LL) {
3278       load_unsigned_byte(cnt1, Address(str2, result));
3279       load_unsigned_byte(result, Address(str1, result));
3280     } else if (ae == StrIntrinsicNode::UU) {
3281       load_unsigned_short(cnt1, Address(str2, result, scale));
3282       load_unsigned_short(result, Address(str1, result, scale));
3283     } else {
3284       load_unsigned_short(cnt1, Address(str2, result, scale2));
3285       load_unsigned_byte(result, Address(str1, result, scale1));
3286     }
3287     subl(result, cnt1);
3288     jmpb(POP_LABEL);
3289   }//if (VM_Version::supports_avx512vlbw())
3290 #endif // _LP64
3291 
3292   // Discard the stored length difference
3293   bind(POP_LABEL);
3294   pop(cnt1);
3295 
3296   // That's it
3297   bind(DONE_LABEL);
3298   if (ae == StrIntrinsicNode::UL) {
3299     negl(result);
3300   }
3301 
3302 }
3303 
3304 // Search for Non-ASCII character (Negative byte value) in a byte array,
3305 // return true if it has any and false otherwise.
3306 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3307 //   @IntrinsicCandidate
3308 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
3309 //     for (int i = off; i < off + len; i++) {
3310 //       if (ba[i] < 0) {
3311 //         return true;
3312 //       }
3313 //     }
3314 //     return false;
3315 //   }
3316 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3317   Register result, Register tmp1,
3318   XMMRegister vec1, XMMRegister vec2) {
3319   // rsi: byte array
3320   // rcx: len
3321   // rax: result
3322   ShortBranchVerifier sbv(this);
3323   assert_different_registers(ary1, len, result, tmp1);
3324   assert_different_registers(vec1, vec2);
3325   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3326 
3327   // len == 0
3328   testl(len, len);
3329   jcc(Assembler::zero, FALSE_LABEL);
3330 
3331   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3332     VM_Version::supports_avx512vlbw() &&
3333     VM_Version::supports_bmi2()) {
3334 
3335     Label test_64_loop, test_tail;
3336     Register tmp3_aliased = len;
3337 
3338     movl(tmp1, len);
3339     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3340 
3341     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
3342     andl(len, ~(64 - 1));    // vector count (in chars)
3343     jccb(Assembler::zero, test_tail);
3344 
3345     lea(ary1, Address(ary1, len, Address::times_1));
3346     negptr(len);
3347 
3348     bind(test_64_loop);
3349     // Check whether our 64 elements of size byte contain negatives
3350     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3351     kortestql(k2, k2);
3352     jcc(Assembler::notZero, TRUE_LABEL);
3353 
3354     addptr(len, 64);
3355     jccb(Assembler::notZero, test_64_loop);
3356 
3357 
3358     bind(test_tail);
3359     // bail out when there is nothing to be done
3360     testl(tmp1, -1);
3361     jcc(Assembler::zero, FALSE_LABEL);
3362 
3363     // ~(~0 << len) applied up to two times (for 32-bit scenario)
3364 #ifdef _LP64
3365     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3366     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3367     notq(tmp3_aliased);
3368     kmovql(k3, tmp3_aliased);
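    // e.g. tmp1 == 5: ~(~0 << 5) == 0x1f, so k3 enables exactly the 5 tail
    // bytes for the masked compare below.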
3369 #else
3370     Label k_init;
3371     jmp(k_init);
3372 
3373     // We cannot build the 64-bit mask in a general purpose register here, so we
3374     // move the data required to compose 64 1's into the instruction stream:
3375     // we emit a 64-byte-wide series of elements 0..63 which is later used as a
3376     // compare target against the tail count contained in the tmp1 register.
3377     // The result is a k register holding tmp1 consecutive 1's, counting from
3378     // the least significant bit.
3379     address tmp = pc();
3380     emit_int64(0x0706050403020100);
3381     emit_int64(0x0F0E0D0C0B0A0908);
3382     emit_int64(0x1716151413121110);
3383     emit_int64(0x1F1E1D1C1B1A1918);
3384     emit_int64(0x2726252423222120);
3385     emit_int64(0x2F2E2D2C2B2A2928);
3386     emit_int64(0x3736353433323130);
3387     emit_int64(0x3F3E3D3C3B3A3938);
3388 
3389     bind(k_init);
3390     lea(len, InternalAddress(tmp));
3391     // create mask to test for negative byte inside a vector
3392     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3393     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
3394 
3395 #endif
3396     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3397     ktestq(k2, k3);
3398     jcc(Assembler::notZero, TRUE_LABEL);
3399 
3400     jmp(FALSE_LABEL);
3401   } else {
3402     movl(result, len); // copy
3403 
3404     if (UseAVX >= 2 && UseSSE >= 2) {
3405       // With AVX2, use 32-byte vector compare
3406       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3407 
3408       // Compare 32-byte vectors
3409       andl(result, 0x0000001f);  //   tail count (in bytes)
3410       andl(len, 0xffffffe0);   // vector count (in bytes)
3411       jccb(Assembler::zero, COMPARE_TAIL);
3412 
3413       lea(ary1, Address(ary1, len, Address::times_1));
3414       negptr(len);
3415 
3416       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
3417       movdl(vec2, tmp1);
3418       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
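      // vptest below sets ZF only when (data & 0x80808080... mask) is all zero;
      // a notZero outcome therefore means some byte has its sign bit set, i.e.
      // a negative byte was found.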
3419 
3420       bind(COMPARE_WIDE_VECTORS);
3421       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3422       vptest(vec1, vec2);
3423       jccb(Assembler::notZero, TRUE_LABEL);
3424       addptr(len, 32);
3425       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3426 
3427       testl(result, result);
3428       jccb(Assembler::zero, FALSE_LABEL);
3429 
3430       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3431       vptest(vec1, vec2);
3432       jccb(Assembler::notZero, TRUE_LABEL);
3433       jmpb(FALSE_LABEL);
3434 
3435       bind(COMPARE_TAIL); // len is zero
3436       movl(len, result);
3437       // Fallthru to tail compare
3438     } else if (UseSSE42Intrinsics) {
3439       // With SSE4.2, use double quad vector compare
3440       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3441 
3442       // Compare 16-byte vectors
3443       andl(result, 0x0000000f);  //   tail count (in bytes)
3444       andl(len, 0xfffffff0);   // vector count (in bytes)
3445       jcc(Assembler::zero, COMPARE_TAIL);
3446 
3447       lea(ary1, Address(ary1, len, Address::times_1));
3448       negptr(len);
3449 
3450       movl(tmp1, 0x80808080);
3451       movdl(vec2, tmp1);
3452       pshufd(vec2, vec2, 0);
3453 
3454       bind(COMPARE_WIDE_VECTORS);
3455       movdqu(vec1, Address(ary1, len, Address::times_1));
3456       ptest(vec1, vec2);
3457       jcc(Assembler::notZero, TRUE_LABEL);
3458       addptr(len, 16);
3459       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3460 
3461       testl(result, result);
3462       jcc(Assembler::zero, FALSE_LABEL);
3463 
3464       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3465       ptest(vec1, vec2);
3466       jccb(Assembler::notZero, TRUE_LABEL);
3467       jmpb(FALSE_LABEL);
3468 
3469       bind(COMPARE_TAIL); // len is zero
3470       movl(len, result);
3471       // Fallthru to tail compare
3472     }
3473   }
3474   // Compare 4-byte vectors
3475   andl(len, 0xfffffffc); // vector count (in bytes)
3476   jccb(Assembler::zero, COMPARE_CHAR);
3477 
3478   lea(ary1, Address(ary1, len, Address::times_1));
3479   negptr(len);
3480 
3481   bind(COMPARE_VECTORS);
3482   movl(tmp1, Address(ary1, len, Address::times_1));
3483   andl(tmp1, 0x80808080);
3484   jccb(Assembler::notZero, TRUE_LABEL);
3485   addptr(len, 4);
3486   jcc(Assembler::notZero, COMPARE_VECTORS);
3487 
3488   // Compare trailing char (final 2 bytes), if any
3489   bind(COMPARE_CHAR);
3490   testl(result, 0x2);   // tail  char
3491   jccb(Assembler::zero, COMPARE_BYTE);
3492   load_unsigned_short(tmp1, Address(ary1, 0));
3493   andl(tmp1, 0x00008080);
3494   jccb(Assembler::notZero, TRUE_LABEL);
3495   subptr(result, 2);
3496   lea(ary1, Address(ary1, 2));
3497 
3498   bind(COMPARE_BYTE);
3499   testl(result, 0x1);   // tail  byte
3500   jccb(Assembler::zero, FALSE_LABEL);
3501   load_unsigned_byte(tmp1, Address(ary1, 0));
3502   andl(tmp1, 0x00000080);
3503   jccb(Assembler::notEqual, TRUE_LABEL);
3504   jmpb(FALSE_LABEL);
3505 
3506   bind(TRUE_LABEL);
3507   movl(result, 1);   // return true
3508   jmpb(DONE);
3509 
3510   bind(FALSE_LABEL);
3511   xorl(result, result); // return false
3512 
3513   // That's it
3514   bind(DONE);
3515   if (UseAVX >= 2 && UseSSE >= 2) {
3516     // clean upper bits of YMM registers
3517     vpxor(vec1, vec1);
3518     vpxor(vec2, vec2);
3519   }
3520 }
3521 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
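// As a rough scalar reference for the array case (illustrative only; the
// substring case skips the null and length checks and takes 'limit' directly):
//   static bool equals_ref(const jbyte* a1, int len1, const jbyte* a2, int len2) {
//     if (a1 == a2) return true;
//     if (a1 == NULL || a2 == NULL || len1 != len2) return false;
//     for (int i = 0; i < len1; i++) {
//       if (a1[i] != a2[i]) return false;   // first mismatch decides
//     }
//     return true;
//   }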
3522 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3523                                       Register limit, Register result, Register chr,
3524                                       XMMRegister vec1, XMMRegister vec2, bool is_char) {
3525   ShortBranchVerifier sbv(this);
3526   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3527 
3528   int length_offset  = arrayOopDesc::length_offset_in_bytes();
3529   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3530 
3531   if (is_array_equ) {
3532     // Check the input args
3533     cmpoop(ary1, ary2);
3534     jcc(Assembler::equal, TRUE_LABEL);
3535 
3536     // Need additional checks for arrays_equals.
3537     testptr(ary1, ary1);
3538     jcc(Assembler::zero, FALSE_LABEL);
3539     testptr(ary2, ary2);
3540     jcc(Assembler::zero, FALSE_LABEL);
3541 
3542     // Check the lengths
3543     movl(limit, Address(ary1, length_offset));
3544     cmpl(limit, Address(ary2, length_offset));
3545     jcc(Assembler::notEqual, FALSE_LABEL);
3546   }
3547 
3548   // count == 0
3549   testl(limit, limit);
3550   jcc(Assembler::zero, TRUE_LABEL);
3551 
3552   if (is_array_equ) {
3553     // Load array address
3554     lea(ary1, Address(ary1, base_offset));
3555     lea(ary2, Address(ary2, base_offset));
3556   }
3557 
3558   if (is_array_equ && is_char) {
3559     // arrays_equals when used for char[].
3560     shll(limit, 1);      // byte count != 0
3561   }
3562   movl(result, limit); // copy
3563 
3564   if (UseAVX >= 2) {
3565     // With AVX2, use 32-byte vector compare
3566     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3567 
3568     // Compare 32-byte vectors
3569     andl(result, 0x0000001f);  //   tail count (in bytes)
3570     andl(limit, 0xffffffe0);   // vector count (in bytes)
3571     jcc(Assembler::zero, COMPARE_TAIL);
3572 
3573     lea(ary1, Address(ary1, limit, Address::times_1));
3574     lea(ary2, Address(ary2, limit, Address::times_1));
3575     negptr(limit);
3576 
3577 #ifdef _LP64
3578     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3579       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3580 
3581       cmpl(limit, -64);
3582       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3583 
3584       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3585 
3586       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3587       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3588       kortestql(k7, k7);
3589       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3590       addptr(limit, 64);  // update since we already compared at this addr
3591       cmpl(limit, -64);
3592       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3593 
3594       // At this point we may still need to compare -limit+result bytes.
3595       // We could execute the next two instructions and just continue via the non-wide path:
3596       //  cmpl(limit, 0);
3597       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3598       // But since we stopped at the points ary{1,2}+limit which are
3599       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3600       // (|limit| <= 32 and result < 32),
3601       // we may just compare the last 64 bytes.
3602       //
3603       addptr(result, -64);   // it is safe because we just came from this area
3604       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3605       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3606       kortestql(k7, k7);
3607       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3608 
3609       jmp(TRUE_LABEL);
3610 
3611       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3612 
3613     }//if (VM_Version::supports_avx512vlbw())
3614 #endif //_LP64
3615     bind(COMPARE_WIDE_VECTORS);
3616     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3617     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3618     vpxor(vec1, vec2);
3619 
3620     vptest(vec1, vec1);
3621     jcc(Assembler::notZero, FALSE_LABEL);
3622     addptr(limit, 32);
3623     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3624 
3625     testl(result, result);
3626     jcc(Assembler::zero, TRUE_LABEL);
3627 
3628     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3629     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3630     vpxor(vec1, vec2);
3631 
3632     vptest(vec1, vec1);
3633     jccb(Assembler::notZero, FALSE_LABEL);
3634     jmpb(TRUE_LABEL);
3635 
3636     bind(COMPARE_TAIL); // limit is zero
3637     movl(limit, result);
3638     // Fallthru to tail compare
3639   } else if (UseSSE42Intrinsics) {
3640     // With SSE4.2, use double quad vector compare
3641     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3642 
3643     // Compare 16-byte vectors
3644     andl(result, 0x0000000f);  //   tail count (in bytes)
3645     andl(limit, 0xfffffff0);   // vector count (in bytes)
3646     jcc(Assembler::zero, COMPARE_TAIL);
3647 
3648     lea(ary1, Address(ary1, limit, Address::times_1));
3649     lea(ary2, Address(ary2, limit, Address::times_1));
3650     negptr(limit);
3651 
3652     bind(COMPARE_WIDE_VECTORS);
3653     movdqu(vec1, Address(ary1, limit, Address::times_1));
3654     movdqu(vec2, Address(ary2, limit, Address::times_1));
3655     pxor(vec1, vec2);
3656 
3657     ptest(vec1, vec1);
3658     jcc(Assembler::notZero, FALSE_LABEL);
3659     addptr(limit, 16);
3660     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3661 
3662     testl(result, result);
3663     jcc(Assembler::zero, TRUE_LABEL);
3664 
3665     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3666     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3667     pxor(vec1, vec2);
3668 
3669     ptest(vec1, vec1);
3670     jccb(Assembler::notZero, FALSE_LABEL);
3671     jmpb(TRUE_LABEL);
3672 
3673     bind(COMPARE_TAIL); // limit is zero
3674     movl(limit, result);
3675     // Fallthru to tail compare
3676   }
3677 
3678   // Compare 4-byte vectors
3679   andl(limit, 0xfffffffc); // vector count (in bytes)
3680   jccb(Assembler::zero, COMPARE_CHAR);
3681 
3682   lea(ary1, Address(ary1, limit, Address::times_1));
3683   lea(ary2, Address(ary2, limit, Address::times_1));
3684   negptr(limit);
3685 
3686   bind(COMPARE_VECTORS);
3687   movl(chr, Address(ary1, limit, Address::times_1));
3688   cmpl(chr, Address(ary2, limit, Address::times_1));
3689   jccb(Assembler::notEqual, FALSE_LABEL);
3690   addptr(limit, 4);
3691   jcc(Assembler::notZero, COMPARE_VECTORS);
3692 
3693   // Compare trailing char (final 2 bytes), if any
3694   bind(COMPARE_CHAR);
3695   testl(result, 0x2);   // tail  char
3696   jccb(Assembler::zero, COMPARE_BYTE);
3697   load_unsigned_short(chr, Address(ary1, 0));
3698   load_unsigned_short(limit, Address(ary2, 0));
3699   cmpl(chr, limit);
3700   jccb(Assembler::notEqual, FALSE_LABEL);
3701 
3702   if (is_array_equ && is_char) {
3703     bind(COMPARE_BYTE);
3704   } else {
3705     lea(ary1, Address(ary1, 2));
3706     lea(ary2, Address(ary2, 2));
3707 
3708     bind(COMPARE_BYTE);
3709     testl(result, 0x1);   // tail  byte
3710     jccb(Assembler::zero, TRUE_LABEL);
3711     load_unsigned_byte(chr, Address(ary1, 0));
3712     load_unsigned_byte(limit, Address(ary2, 0));
3713     cmpl(chr, limit);
3714     jccb(Assembler::notEqual, FALSE_LABEL);
3715   }
3716   bind(TRUE_LABEL);
3717   movl(result, 1);   // return true
3718   jmpb(DONE);
3719 
3720   bind(FALSE_LABEL);
3721   xorl(result, result); // return false
3722 
3723   // That's it
3724   bind(DONE);
3725   if (UseAVX >= 2) {
3726     // clean upper bits of YMM registers
3727     vpxor(vec1, vec1);
3728     vpxor(vec2, vec2);
3729   }
3730 }
3731