1//
2// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
3// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4//
5// This code is free software; you can redistribute it and/or modify it
6// under the terms of the GNU General Public License version 2 only, as
7// published by the Free Software Foundation.
8//
9// This code is distributed in the hope that it will be useful, but WITHOUT
10// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12// version 2 for more details (a copy is included in the LICENSE file that
13// accompanied this code).
14//
15// You should have received a copy of the GNU General Public License version
16// 2 along with this work; if not, write to the Free Software Foundation,
17// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18//
19// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20// or visit www.oracle.com if you need additional information or have any
21// questions.
22//
23//
24
25// X86 Common Architecture Description File
26
27//----------REGISTER DEFINITION BLOCK------------------------------------------
28// This information is used by the matcher and the register allocator to
29// describe individual registers and classes of registers within the target
30// archtecture.
31
32register %{
33//----------Architecture Description Register Definitions----------------------
34// General Registers
35// "reg_def"  name ( register save type, C convention save type,
36//                   ideal register type, encoding );
37// Register Save Types:
38//
39// NS  = No-Save:       The register allocator assumes that these registers
40//                      can be used without saving upon entry to the method, &
41//                      that they do not need to be saved at call sites.
42//
43// SOC = Save-On-Call:  The register allocator assumes that these registers
44//                      can be used without saving upon entry to the method,
45//                      but that they must be saved at call sites.
46//
47// SOE = Save-On-Entry: The register allocator assumes that these registers
48//                      must be saved before using them upon entry to the
49//                      method, but they do not need to be saved at call
50//                      sites.
51//
52// AS  = Always-Save:   The register allocator assumes that these registers
53//                      must be saved before using them upon entry to the
54//                      method, & that they must be saved at call sites.
55//
56// Ideal Register Type is used to determine how to save & restore a
57// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
58// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
59//
60// The encoding number is the actual bit-pattern placed into the opcodes.
61
62// XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
63// Word a in each register holds a Float, words ab hold a Double.
64// The whole registers are used in SSE4.2 version intrinsics,
65// array copy stubs and superword operations (see UseSSE42Intrinsics,
66// UseXMMForArrayCopy and UseSuperword flags).
67// For pre EVEX enabled architectures:
68//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
69// For EVEX enabled architectures:
70//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
71//
72// Linux ABI:   No register preserved across function calls
73//              XMM0-XMM7 might hold parameters
74// Windows ABI: XMM6-XMM31 preserved across function calls
75//              XMM0-XMM3 might hold parameters
76
77reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
78reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
79reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
80reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
81reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
82reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
83reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
84reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
85reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
86reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
87reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
88reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
89reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
90reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
91reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
92reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
93
94reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
95reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
96reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
97reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
98reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
99reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
100reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
101reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
102reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
103reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
104reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
105reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
106reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
107reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
108reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
109reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
110
111reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
112reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
113reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
114reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
115reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
116reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
117reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
118reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
119reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
120reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
121reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
122reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
123reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
124reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
125reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
126reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
127
128reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
129reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
130reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
131reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
132reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
133reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
134reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
135reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
136reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
137reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
138reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
139reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
140reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
141reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
142reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
143reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
144
145reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
146reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
147reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
148reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
149reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
150reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
151reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
152reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
153reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
154reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
155reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
156reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
157reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
158reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
159reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
160reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
161
162reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
163reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
164reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
165reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
166reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
167reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
168reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
169reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
170reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
171reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
172reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
173reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
174reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
175reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
176reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
177reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
178
179reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
180reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
181reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
182reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
183reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
184reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
185reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
186reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
187reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
188reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
189reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
190reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
191reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
192reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
193reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
194reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
195
196reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
197reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
198reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
199reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
200reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
201reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
202reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
203reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
204reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
205reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
206reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
207reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
208reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
209reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
210reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
211reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
212
213#ifdef _LP64
214
215reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
216reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
217reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
218reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
219reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
220reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
221reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
222reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
223reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
224reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
225reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
226reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
227reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
228reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
229reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
230reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
231
232reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
233reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
234reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
235reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
236reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
237reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
238reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
239reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
240reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
241reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
242reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
243reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
244reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
245reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
246reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
247reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
248
249reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
250reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
251reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
252reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
253reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
254reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
255reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
256reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
257reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
258reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
259reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
260reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
261reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
262reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
263reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
264reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
265
266reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
267reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
268reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
269reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
270reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
271reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
272reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
273reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
274reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
275reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
276reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
277reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
278reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
279reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
280reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
281reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
282
283reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
284reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
285reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
286reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
287reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
288reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
289reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
290reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
291reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
292reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
293reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
294reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
295reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
296reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
297reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
298reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
299
300reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
301reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
302reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
303reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
304reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
305reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
306reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
307reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
308reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
309reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
310reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
311reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
312reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
313reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
314reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
315reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
316
317reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
318reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
319reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
320reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
321reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
322reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
323reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
324reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
325reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
326reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
327reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
328reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
329reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
330reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
331reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
332reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
333
334reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
335reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
336reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
337reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
338reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
339reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
340reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
341reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
342reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
343reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
344reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
345reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
346reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
347reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
348reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
349reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
350
351reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
352reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
353reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
354reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
355reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
356reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
357reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
358reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
359reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
360reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
361reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
362reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
363reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
364reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
365reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
366reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
367
368reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
369reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
370reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
371reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
372reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
373reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
374reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
375reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
376reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
377reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
378reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
379reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
380reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
381reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
382reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
383reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
384
385reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
386reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
387reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
388reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
389reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
390reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
391reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
392reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
393reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
394reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
395reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
396reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
397reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
398reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
399reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
400reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
401
402reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
403reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
404reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
405reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
406reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
407reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
408reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
409reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
410reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
411reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
412reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
413reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
414reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
415reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
416reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
417reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
418
419reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
420reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
421reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
422reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
423reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
424reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
425reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
426reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
427reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
428reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
429reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
430reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
431reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
432reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
433reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
434reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
435
436reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
437reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
438reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
439reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
440reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
441reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
442reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
443reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
444reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
445reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
446reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
447reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
448reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
449reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
450reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
451reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
452
453reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
454reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
455reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
456reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
457reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
458reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
459reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
460reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
461reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
462reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
463reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
464reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
465reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
466reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
467reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
468reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
469
470reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
471reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
472reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
473reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
474reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
475reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
476reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
477reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
478reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
479reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
480reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
481reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
482reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
483reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
484reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
485reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
486
487reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
488reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
489reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
490reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
491reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
492reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
493reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
494reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
495reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
496reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
497reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
498reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
499reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
500reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
501reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
502reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
503
504reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
505reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
506reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
507reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
508reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
509reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
510reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
511reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
512reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
513reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
514reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
515reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
516reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
517reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
518reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
519reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
520
521reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
522reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
523reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
524reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
525reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
526reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
527reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
528reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
529reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
530reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
531reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
532reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
533reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
534reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
535reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
536reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
537
538reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
539reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
540reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
541reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
542reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
543reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
544reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
545reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
546reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
547reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
548reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
549reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
550reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
551reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
552reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
553reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
554
555reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
556reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
557reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
558reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
559reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
560reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
561reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
562reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
563reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
564reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
565reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
566reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
567reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
568reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
569reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
570reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
571
572reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
573reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
574reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
575reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
576reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
577reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
578reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
579reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
580reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
581reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
582reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
583reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
584reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
585reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
586reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
587reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
588
589reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
590reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
591reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
592reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
593reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
594reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
595reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
596reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
597reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
598reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
599reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
600reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
601reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
602reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
603reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
604reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
605
606reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
607reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
608reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
609reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
610reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
611reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
612reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
613reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
614reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
615reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
616reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
617reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
618reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
619reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
620reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
621reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
622
623#endif // _LP64
624
625#ifdef _LP64
626reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
627#else
628reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
629#endif // _LP64
630
631// AVX3 Mask Registers.
632reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
633reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());
634
635reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
636reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());
637
638reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
639reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());
640
641reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
642reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());
643
644reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
645reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());
646
647reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
648reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());
649
650reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
651reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
652
653
654alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
655                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
656                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
657                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
658                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
659                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
660                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
661                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
662#ifdef _LP64
663                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
664                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
665                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
666                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
667                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
668                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
669                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
670                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
671                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
672                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
673                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
674                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
675                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
676                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
677                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
678                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
679                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
680                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
681                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
682                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
683                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
684                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
685                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
686                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
687#endif
688                      );
689
690alloc_class chunk2(K7, K7_H,
691                   K6, K6_H,
692                   K5, K5_H,
693                   K4, K4_H,
694                   K3, K3_H,
695                   K2, K2_H,
696                   K1, K1_H);
697
698reg_class  vectmask_reg(K1, K1_H,
699                        K2, K2_H,
700                        K3, K3_H,
701                        K4, K4_H,
702                        K5, K5_H,
703                        K6, K6_H,
704                        K7, K7_H);
705
706reg_class vectmask_reg_K1(K1, K1_H);
707reg_class vectmask_reg_K2(K2, K2_H);
708reg_class vectmask_reg_K3(K3, K3_H);
709reg_class vectmask_reg_K4(K4, K4_H);
710reg_class vectmask_reg_K5(K5, K5_H);
711reg_class vectmask_reg_K6(K6, K6_H);
712reg_class vectmask_reg_K7(K7, K7_H);
713
714// flags allocation class should be last.
715alloc_class chunk3(RFLAGS);
716
717
718// Singleton class for condition codes
719reg_class int_flags(RFLAGS);
720
721// Class for pre evex float registers
722reg_class float_reg_legacy(XMM0,
723                    XMM1,
724                    XMM2,
725                    XMM3,
726                    XMM4,
727                    XMM5,
728                    XMM6,
729                    XMM7
730#ifdef _LP64
731                   ,XMM8,
732                    XMM9,
733                    XMM10,
734                    XMM11,
735                    XMM12,
736                    XMM13,
737                    XMM14,
738                    XMM15
739#endif
740                    );
741
742// Class for evex float registers
743reg_class float_reg_evex(XMM0,
744                    XMM1,
745                    XMM2,
746                    XMM3,
747                    XMM4,
748                    XMM5,
749                    XMM6,
750                    XMM7
751#ifdef _LP64
752                   ,XMM8,
753                    XMM9,
754                    XMM10,
755                    XMM11,
756                    XMM12,
757                    XMM13,
758                    XMM14,
759                    XMM15,
760                    XMM16,
761                    XMM17,
762                    XMM18,
763                    XMM19,
764                    XMM20,
765                    XMM21,
766                    XMM22,
767                    XMM23,
768                    XMM24,
769                    XMM25,
770                    XMM26,
771                    XMM27,
772                    XMM28,
773                    XMM29,
774                    XMM30,
775                    XMM31
776#endif
777                    );
778
779reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
780reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
781
782// Class for pre evex double registers
783reg_class double_reg_legacy(XMM0,  XMM0b,
784                     XMM1,  XMM1b,
785                     XMM2,  XMM2b,
786                     XMM3,  XMM3b,
787                     XMM4,  XMM4b,
788                     XMM5,  XMM5b,
789                     XMM6,  XMM6b,
790                     XMM7,  XMM7b
791#ifdef _LP64
792                    ,XMM8,  XMM8b,
793                     XMM9,  XMM9b,
794                     XMM10, XMM10b,
795                     XMM11, XMM11b,
796                     XMM12, XMM12b,
797                     XMM13, XMM13b,
798                     XMM14, XMM14b,
799                     XMM15, XMM15b
800#endif
801                     );
802
803// Class for evex double registers
804reg_class double_reg_evex(XMM0,  XMM0b,
805                     XMM1,  XMM1b,
806                     XMM2,  XMM2b,
807                     XMM3,  XMM3b,
808                     XMM4,  XMM4b,
809                     XMM5,  XMM5b,
810                     XMM6,  XMM6b,
811                     XMM7,  XMM7b
812#ifdef _LP64
813                    ,XMM8,  XMM8b,
814                     XMM9,  XMM9b,
815                     XMM10, XMM10b,
816                     XMM11, XMM11b,
817                     XMM12, XMM12b,
818                     XMM13, XMM13b,
819                     XMM14, XMM14b,
820                     XMM15, XMM15b,
821                     XMM16, XMM16b,
822                     XMM17, XMM17b,
823                     XMM18, XMM18b,
824                     XMM19, XMM19b,
825                     XMM20, XMM20b,
826                     XMM21, XMM21b,
827                     XMM22, XMM22b,
828                     XMM23, XMM23b,
829                     XMM24, XMM24b,
830                     XMM25, XMM25b,
831                     XMM26, XMM26b,
832                     XMM27, XMM27b,
833                     XMM28, XMM28b,
834                     XMM29, XMM29b,
835                     XMM30, XMM30b,
836                     XMM31, XMM31b
837#endif
838                     );
839
840reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
841reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
842
843// Class for pre evex 32bit vector registers
844reg_class vectors_reg_legacy(XMM0,
845                      XMM1,
846                      XMM2,
847                      XMM3,
848                      XMM4,
849                      XMM5,
850                      XMM6,
851                      XMM7
852#ifdef _LP64
853                     ,XMM8,
854                      XMM9,
855                      XMM10,
856                      XMM11,
857                      XMM12,
858                      XMM13,
859                      XMM14,
860                      XMM15
861#endif
862                      );
863
864// Class for evex 32bit vector registers
865reg_class vectors_reg_evex(XMM0,
866                      XMM1,
867                      XMM2,
868                      XMM3,
869                      XMM4,
870                      XMM5,
871                      XMM6,
872                      XMM7
873#ifdef _LP64
874                     ,XMM8,
875                      XMM9,
876                      XMM10,
877                      XMM11,
878                      XMM12,
879                      XMM13,
880                      XMM14,
881                      XMM15,
882                      XMM16,
883                      XMM17,
884                      XMM18,
885                      XMM19,
886                      XMM20,
887                      XMM21,
888                      XMM22,
889                      XMM23,
890                      XMM24,
891                      XMM25,
892                      XMM26,
893                      XMM27,
894                      XMM28,
895                      XMM29,
896                      XMM30,
897                      XMM31
898#endif
899                      );
900
901reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
902reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
903
904// Class for all 64bit vector registers
905reg_class vectord_reg_legacy(XMM0,  XMM0b,
906                      XMM1,  XMM1b,
907                      XMM2,  XMM2b,
908                      XMM3,  XMM3b,
909                      XMM4,  XMM4b,
910                      XMM5,  XMM5b,
911                      XMM6,  XMM6b,
912                      XMM7,  XMM7b
913#ifdef _LP64
914                     ,XMM8,  XMM8b,
915                      XMM9,  XMM9b,
916                      XMM10, XMM10b,
917                      XMM11, XMM11b,
918                      XMM12, XMM12b,
919                      XMM13, XMM13b,
920                      XMM14, XMM14b,
921                      XMM15, XMM15b
922#endif
923                      );
924
925// Class for all 64bit vector registers
926reg_class vectord_reg_evex(XMM0,  XMM0b,
927                      XMM1,  XMM1b,
928                      XMM2,  XMM2b,
929                      XMM3,  XMM3b,
930                      XMM4,  XMM4b,
931                      XMM5,  XMM5b,
932                      XMM6,  XMM6b,
933                      XMM7,  XMM7b
934#ifdef _LP64
935                     ,XMM8,  XMM8b,
936                      XMM9,  XMM9b,
937                      XMM10, XMM10b,
938                      XMM11, XMM11b,
939                      XMM12, XMM12b,
940                      XMM13, XMM13b,
941                      XMM14, XMM14b,
942                      XMM15, XMM15b,
943                      XMM16, XMM16b,
944                      XMM17, XMM17b,
945                      XMM18, XMM18b,
946                      XMM19, XMM19b,
947                      XMM20, XMM20b,
948                      XMM21, XMM21b,
949                      XMM22, XMM22b,
950                      XMM23, XMM23b,
951                      XMM24, XMM24b,
952                      XMM25, XMM25b,
953                      XMM26, XMM26b,
954                      XMM27, XMM27b,
955                      XMM28, XMM28b,
956                      XMM29, XMM29b,
957                      XMM30, XMM30b,
958                      XMM31, XMM31b
959#endif
960                      );
961
962reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
963reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
964
965// Class for all 128bit vector registers
966reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
967                      XMM1,  XMM1b,  XMM1c,  XMM1d,
968                      XMM2,  XMM2b,  XMM2c,  XMM2d,
969                      XMM3,  XMM3b,  XMM3c,  XMM3d,
970                      XMM4,  XMM4b,  XMM4c,  XMM4d,
971                      XMM5,  XMM5b,  XMM5c,  XMM5d,
972                      XMM6,  XMM6b,  XMM6c,  XMM6d,
973                      XMM7,  XMM7b,  XMM7c,  XMM7d
974#ifdef _LP64
975                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
976                      XMM9,  XMM9b,  XMM9c,  XMM9d,
977                      XMM10, XMM10b, XMM10c, XMM10d,
978                      XMM11, XMM11b, XMM11c, XMM11d,
979                      XMM12, XMM12b, XMM12c, XMM12d,
980                      XMM13, XMM13b, XMM13c, XMM13d,
981                      XMM14, XMM14b, XMM14c, XMM14d,
982                      XMM15, XMM15b, XMM15c, XMM15d
983#endif
984                      );
985
986// Class for all 128bit vector registers
987reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
988                      XMM1,  XMM1b,  XMM1c,  XMM1d,
989                      XMM2,  XMM2b,  XMM2c,  XMM2d,
990                      XMM3,  XMM3b,  XMM3c,  XMM3d,
991                      XMM4,  XMM4b,  XMM4c,  XMM4d,
992                      XMM5,  XMM5b,  XMM5c,  XMM5d,
993                      XMM6,  XMM6b,  XMM6c,  XMM6d,
994                      XMM7,  XMM7b,  XMM7c,  XMM7d
995#ifdef _LP64
996                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
997                      XMM9,  XMM9b,  XMM9c,  XMM9d,
998                      XMM10, XMM10b, XMM10c, XMM10d,
999                      XMM11, XMM11b, XMM11c, XMM11d,
1000                      XMM12, XMM12b, XMM12c, XMM12d,
1001                      XMM13, XMM13b, XMM13c, XMM13d,
1002                      XMM14, XMM14b, XMM14c, XMM14d,
1003                      XMM15, XMM15b, XMM15c, XMM15d,
1004                      XMM16, XMM16b, XMM16c, XMM16d,
1005                      XMM17, XMM17b, XMM17c, XMM17d,
1006                      XMM18, XMM18b, XMM18c, XMM18d,
1007                      XMM19, XMM19b, XMM19c, XMM19d,
1008                      XMM20, XMM20b, XMM20c, XMM20d,
1009                      XMM21, XMM21b, XMM21c, XMM21d,
1010                      XMM22, XMM22b, XMM22c, XMM22d,
1011                      XMM23, XMM23b, XMM23c, XMM23d,
1012                      XMM24, XMM24b, XMM24c, XMM24d,
1013                      XMM25, XMM25b, XMM25c, XMM25d,
1014                      XMM26, XMM26b, XMM26c, XMM26d,
1015                      XMM27, XMM27b, XMM27c, XMM27d,
1016                      XMM28, XMM28b, XMM28c, XMM28d,
1017                      XMM29, XMM29b, XMM29c, XMM29d,
1018                      XMM30, XMM30b, XMM30c, XMM30d,
1019                      XMM31, XMM31b, XMM31c, XMM31d
1020#endif
1021                      );
1022
1023reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025
1026// Class for all 256bit vector registers
1027reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035#ifdef _LP64
1036                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044#endif
1045                      );
1046
1047// Class for all 256bit vector registers
1048reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056#ifdef _LP64
1057                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081#endif
1082                      );
1083
1084reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086
1087// Class for all 512bit vector registers
1088reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096#ifdef _LP64
1097                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                     ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121#endif
1122                      );
1123
1124// Class for restricted 512bit vector registers
1125reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133#ifdef _LP64
1134                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142#endif
1143                      );
1144
1145reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147
1148reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149%}
1150
1151
1152//----------SOURCE BLOCK-------------------------------------------------------
1153// This is a block of C++ code which provides values, functions, and
1154// definitions necessary in the rest of the architecture description
1155
1156source_hpp %{
1157// Header information of the source block.
1158// Method declarations/definitions which are used outside
1159// the ad-scope can conveniently be defined here.
1160//
1161// To keep related declarations/definitions/uses close together,
1162// we switch between source %{ }% and source_hpp %{ }% freely as needed.
1163
1164#include "runtime/vm_version.hpp"
1165
1166class NativeJump;
1167
1168class CallStubImpl {
1169
1170  //--------------------------------------------------------------
1171  //---<  Used for optimization in Compile::shorten_branches  >---
1172  //--------------------------------------------------------------
1173
1174 public:
1175  // Size of call trampoline stub.
1176  static uint size_call_trampoline() {
1177    return 0; // no call trampolines on this platform
1178  }
1179
1180  // number of relocations needed by a call trampoline stub
1181  static uint reloc_call_trampoline() {
1182    return 0; // no call trampolines on this platform
1183  }
1184};
1185
1186class HandlerImpl {
1187
1188 public:
1189
1190  static int emit_exception_handler(CodeBuffer &cbuf);
1191  static int emit_deopt_handler(CodeBuffer& cbuf);
1192
1193  static uint size_exception_handler() {
1194    // NativeCall instruction size is the same as NativeJump.
1195    // exception handler starts out as jump and can be patched to
1196    // a call be deoptimization.  (4932387)
1197    // Note that this value is also credited (in output.cpp) to
1198    // the size of the code section.
1199    return NativeJump::instruction_size;
1200  }
1201
1202#ifdef _LP64
1203  static uint size_deopt_handler() {
1204    // three 5 byte instructions plus one move for unreachable address.
1205    return 15+3;
1206  }
1207#else
1208  static uint size_deopt_handler() {
1209    // NativeCall instruction size is the same as NativeJump.
1210    // exception handler starts out as jump and can be patched to
1211    // a call be deoptimization.  (4932387)
1212    // Note that this value is also credited (in output.cpp) to
1213    // the size of the code section.
1214    return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215  }
1216#endif
1217};
1218
1219
1220inline uint vector_length(const Node* n) {
1221  const TypeVect* vt = n->bottom_type()->is_vect();
1222  return vt->length();
1223}
1224
1225inline uint vector_length(const MachNode* use, MachOper* opnd) {
1226  uint def_idx = use->operand_index(opnd);
1227  Node* def = use->in(def_idx);
1228  return def->bottom_type()->is_vect()->length();
1229}
1230
1231inline uint vector_length_in_bytes(const Node* n) {
1232  const TypeVect* vt = n->bottom_type()->is_vect();
1233  return vt->length_in_bytes();
1234}
1235
1236inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1237  uint def_idx = use->operand_index(opnd);
1238  Node* def = use->in(def_idx);
1239  return def->bottom_type()->is_vect()->length_in_bytes();
1240}
1241
1242inline BasicType vector_element_basic_type(const Node *n) {
1243  return n->bottom_type()->is_vect()->element_basic_type();
1244}
1245
1246inline BasicType vector_element_basic_type(const MachNode *use, MachOper* opnd) {
1247  uint def_idx = use->operand_index(opnd);
1248  Node* def = use->in(def_idx);
1249  return def->bottom_type()->is_vect()->element_basic_type();
1250}
1251
1252inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1253  switch(bytes) {
1254    case  4: // fall-through
1255    case  8: // fall-through
1256    case 16: return Assembler::AVX_128bit;
1257    case 32: return Assembler::AVX_256bit;
1258    case 64: return Assembler::AVX_512bit;
1259
1260    default: {
1261      ShouldNotReachHere();
1262      return Assembler::AVX_NoVec;
1263    }
1264  }
1265}
1266
1267static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1268  return vector_length_encoding(vector_length_in_bytes(n));
1269}
1270
1271static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1272  uint def_idx = use->operand_index(opnd);
1273  Node* def = use->in(def_idx);
1274  return vector_length_encoding(def);
1275}
1276
1277static inline bool is_unsigned_booltest_pred(int bt) {
1278  return  ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1279}
1280
1281class Node::PD {
1282public:
1283  enum NodeFlags {
1284    Flag_intel_jcc_erratum = Node::_last_flag << 1,
1285    _last_flag             = Flag_intel_jcc_erratum
1286  };
1287};
1288
1289%} // end source_hpp
1290
1291source %{
1292
1293#include "opto/addnode.hpp"
1294#include "c2_intelJccErratum_x86.hpp"
1295
1296void PhaseOutput::pd_perform_mach_node_analysis() {
1297  if (VM_Version::has_intel_jcc_erratum()) {
1298    int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1299    _buf_sizes._code += extra_padding;
1300  }
1301}
1302
1303int MachNode::pd_alignment_required() const {
1304  if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1305    // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1306    return IntelJccErratum::largest_jcc_size() + 1;
1307  } else {
1308    return 1;
1309  }
1310}
1311
1312int MachNode::compute_padding(int current_offset) const {
1313  if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1314    Compile* C = Compile::current();
1315    PhaseOutput* output = C->output();
1316    Block* block = output->block();
1317    int index = output->index();
1318    return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1319  } else {
1320    return 0;
1321  }
1322}
1323
1324// Emit exception handler code.
1325// Stuff framesize into a register and call a VM stub routine.
1326int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1327
1328  // Note that the code buffer's insts_mark is always relative to insts.
1329  // That's why we must use the macroassembler to generate a handler.
1330  C2_MacroAssembler _masm(&cbuf);
1331  address base = __ start_a_stub(size_exception_handler());
1332  if (base == NULL) {
1333    ciEnv::current()->record_failure("CodeCache is full");
1334    return 0;  // CodeBuffer::expand failed
1335  }
1336  int offset = __ offset();
1337  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1338  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1339  __ end_a_stub();
1340  return offset;
1341}
1342
1343// Emit deopt handler code.
1344int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1345
1346  // Note that the code buffer's insts_mark is always relative to insts.
1347  // That's why we must use the macroassembler to generate a handler.
1348  C2_MacroAssembler _masm(&cbuf);
1349  address base = __ start_a_stub(size_deopt_handler());
1350  if (base == NULL) {
1351    ciEnv::current()->record_failure("CodeCache is full");
1352    return 0;  // CodeBuffer::expand failed
1353  }
1354  int offset = __ offset();
1355
1356#ifdef _LP64
1357  address the_pc = (address) __ pc();
1358  Label next;
1359  // push a "the_pc" on the stack without destroying any registers
1360  // as they all may be live.
1361
1362  // push address of "next"
1363  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1364  __ bind(next);
1365  // adjust it so it matches "the_pc"
1366  __ subptr(Address(rsp, 0), __ offset() - offset);
1367#else
1368  InternalAddress here(__ pc());
1369  __ pushptr(here.addr());
1370#endif
1371
1372  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1373  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1374  __ end_a_stub();
1375  return offset;
1376}
1377
1378Assembler::Width widthForType(BasicType bt) {
1379  if (bt == T_BYTE) {
1380    return Assembler::B;
1381  } else if (bt == T_SHORT) {
1382    return Assembler::W;
1383  } else if (bt == T_INT) {
1384    return Assembler::D;
1385  } else {
1386    assert(bt == T_LONG, "not a long: %s", type2name(bt));
1387    return Assembler::Q;
1388  }
1389}
1390
1391//=============================================================================
1392
1393  // Float masks come from different places depending on platform.
1394#ifdef _LP64
1395  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1396  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1397  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1398  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1399#else
1400  static address float_signmask()  { return (address)float_signmask_pool; }
1401  static address float_signflip()  { return (address)float_signflip_pool; }
1402  static address double_signmask() { return (address)double_signmask_pool; }
1403  static address double_signflip() { return (address)double_signflip_pool; }
1404#endif
1405  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1406  static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1407  static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1408  static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1409  static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1410  static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1411  static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1412  static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1413  static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1414  static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1415  static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1416  static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1417
1418//=============================================================================
1419const bool Matcher::match_rule_supported(int opcode) {
1420  if (!has_match_rule(opcode)) {
1421    return false; // no match rule present
1422  }
1423  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1424  switch (opcode) {
1425    case Op_AbsVL:
1426    case Op_StoreVectorScatter:
1427      if (UseAVX < 3) {
1428        return false;
1429      }
1430      break;
1431    case Op_PopCountI:
1432    case Op_PopCountL:
1433      if (!UsePopCountInstruction) {
1434        return false;
1435      }
1436      break;
1437    case Op_PopCountVI:
1438      if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1439        return false;
1440      }
1441      break;
1442    case Op_MulVI:
1443      if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1444        return false;
1445      }
1446      break;
1447    case Op_MulVL:
1448      if (UseSSE < 4) { // only with SSE4_1 or AVX
1449        return false;
1450      }
1451      break;
1452    case Op_MulReductionVL:
1453      if (VM_Version::supports_avx512dq() == false) {
1454        return false;
1455      }
1456      break;
1457    case Op_AddReductionVL:
1458      if (UseSSE < 2) { // requires at least SSE2
1459        return false;
1460      }
1461      break;
1462    case Op_AbsVB:
1463    case Op_AbsVS:
1464    case Op_AbsVI:
1465    case Op_AddReductionVI:
1466    case Op_AndReductionV:
1467    case Op_OrReductionV:
1468    case Op_XorReductionV:
1469      if (UseSSE < 3) { // requires at least SSSE3
1470        return false;
1471      }
1472      break;
1473    case Op_VectorLoadShuffle:
1474    case Op_VectorRearrange:
1475    case Op_MulReductionVI:
1476      if (UseSSE < 4) { // requires at least SSE4
1477        return false;
1478      }
1479      break;
1480    case Op_SqrtVD:
1481    case Op_SqrtVF:
1482    case Op_VectorMaskCmp:
1483    case Op_VectorCastB2X:
1484    case Op_VectorCastS2X:
1485    case Op_VectorCastI2X:
1486    case Op_VectorCastL2X:
1487    case Op_VectorCastF2X:
1488    case Op_VectorCastD2X:
1489      if (UseAVX < 1) { // enabled for AVX only
1490        return false;
1491      }
1492      break;
1493    case Op_CompareAndSwapL:
1494#ifdef _LP64
1495    case Op_CompareAndSwapP:
1496#endif
1497      if (!VM_Version::supports_cx8()) {
1498        return false;
1499      }
1500      break;
1501    case Op_CMoveVF:
1502    case Op_CMoveVD:
1503      if (UseAVX < 1) { // enabled for AVX only
1504        return false;
1505      }
1506      break;
1507    case Op_StrIndexOf:
1508      if (!UseSSE42Intrinsics) {
1509        return false;
1510      }
1511      break;
1512    case Op_StrIndexOfChar:
1513      if (!UseSSE42Intrinsics) {
1514        return false;
1515      }
1516      break;
1517    case Op_OnSpinWait:
1518      if (VM_Version::supports_on_spin_wait() == false) {
1519        return false;
1520      }
1521      break;
1522    case Op_MulVB:
1523    case Op_LShiftVB:
1524    case Op_RShiftVB:
1525    case Op_URShiftVB:
1526    case Op_VectorInsert:
1527    case Op_VectorLoadMask:
1528    case Op_VectorStoreMask:
1529    case Op_VectorBlend:
1530      if (UseSSE < 4) {
1531        return false;
1532      }
1533      break;
1534#ifdef _LP64
1535    case Op_MaxD:
1536    case Op_MaxF:
1537    case Op_MinD:
1538    case Op_MinF:
1539      if (UseAVX < 1) { // enabled for AVX only
1540        return false;
1541      }
1542      break;
1543#endif
1544    case Op_CacheWB:
1545    case Op_CacheWBPreSync:
1546    case Op_CacheWBPostSync:
1547      if (!VM_Version::supports_data_cache_line_flush()) {
1548        return false;
1549      }
1550      break;
1551    case Op_ExtractB:
1552    case Op_ExtractL:
1553    case Op_ExtractI:
1554    case Op_RoundDoubleMode:
1555      if (UseSSE < 4) {
1556        return false;
1557      }
1558      break;
1559    case Op_RoundDoubleModeV:
1560      if (VM_Version::supports_avx() == false) {
1561        return false; // 128bit vroundpd is not available
1562      }
1563      break;
1564    case Op_LoadVectorGather:
1565      if (UseAVX < 2) {
1566        return false;
1567      }
1568      break;
1569    case Op_FmaVD:
1570    case Op_FmaVF:
1571      if (!UseFMA) {
1572        return false;
1573      }
1574      break;
1575    case Op_MacroLogicV:
1576      if (UseAVX < 3 || !UseVectorMacroLogic) {
1577        return false;
1578      }
1579      break;
1580
1581    case Op_VectorCmpMasked:
1582    case Op_VectorMaskGen:
1583    case Op_LoadVectorMasked:
1584    case Op_StoreVectorMasked:
1585      if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1586        return false;
1587      }
1588      break;
1589    case Op_VectorMaskFirstTrue:
1590    case Op_VectorMaskLastTrue:
1591    case Op_VectorMaskTrueCount:
1592      if (!is_LP64 || UseAVX < 1) {
1593         return false;
1594      }
1595      break;
1596#ifndef _LP64
1597    case Op_AddReductionVF:
1598    case Op_AddReductionVD:
1599    case Op_MulReductionVF:
1600    case Op_MulReductionVD:
1601      if (UseSSE < 1) { // requires at least SSE
1602        return false;
1603      }
1604      break;
1605    case Op_MulAddVS2VI:
1606    case Op_RShiftVL:
1607    case Op_AbsVD:
1608    case Op_NegVD:
1609      if (UseSSE < 2) {
1610        return false;
1611      }
1612      break;
1613#endif // !LP64
1614    case Op_SignumF:
1615      if (UseSSE < 1) {
1616        return false;
1617      }
1618      break;
1619    case Op_SignumD:
1620      if (UseSSE < 2) {
1621        return false;
1622      }
1623      break;
1624  }
1625  return true;  // Match rules are supported by default.
1626}
1627
1628//------------------------------------------------------------------------
1629
1630// Identify extra cases that we might want to provide match rules for vector nodes and
1631// other intrinsics guarded with vector length (vlen) and element type (bt).
1632const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1633  const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1634  if (!match_rule_supported(opcode)) {
1635    return false;
1636  }
1637  // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1638  //   * SSE2 supports 128bit vectors for all types;
1639  //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1640  //   * AVX2 supports 256bit vectors for all types;
1641  //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1642  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1643  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1644  // And MaxVectorSize is taken into account as well.
1645  if (!vector_size_supported(bt, vlen)) {
1646    return false;
1647  }
1648  // Special cases which require vector length follow:
1649  //   * implementation limitations
1650  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1651  //   * 128bit vroundpd instruction is present only in AVX1
1652  int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1653  switch (opcode) {
1654    case Op_AbsVF:
1655    case Op_NegVF:
1656      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1657        return false; // 512bit vandps and vxorps are not available
1658      }
1659      break;
1660    case Op_AbsVD:
1661    case Op_NegVD:
1662    case Op_MulVL:
1663      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1664        return false; // 512bit vpmullq, vandpd and vxorpd are not available
1665      }
1666      break;
1667    case Op_CMoveVF:
1668      if (vlen != 8) {
1669        return false; // implementation limitation (only vcmov8F_reg is present)
1670      }
1671      break;
1672    case Op_RotateRightV:
1673    case Op_RotateLeftV:
1674    case Op_MacroLogicV:
1675      if (!VM_Version::supports_evex() ||
1676          ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1677        return false;
1678      }
1679      break;
1680    case Op_ClearArray:
1681    case Op_VectorMaskGen:
1682    case Op_VectorCmpMasked:
1683    case Op_LoadVectorMasked:
1684    case Op_StoreVectorMasked:
1685      if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1686        return false;
1687      }
1688      if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1689        return false;
1690      }
1691      break;
1692    case Op_CMoveVD:
1693      if (vlen != 4) {
1694        return false; // implementation limitation (only vcmov4D_reg is present)
1695      }
1696      break;
1697    case Op_MaxV:
1698    case Op_MinV:
1699      if (UseSSE < 4 && is_integral_type(bt)) {
1700        return false;
1701      }
1702      if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1703          // Float/Double intrinsics are enabled for AVX family currently.
1704          if (UseAVX == 0) {
1705            return false;
1706          }
1707          if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1708            return false;
1709          }
1710      }
1711      break;
1712    case Op_CallLeafVector:
1713      if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1714        return false;
1715      }
1716      break;
1717    case Op_AddReductionVI:
1718      if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1719        return false;
1720      }
1721      // fallthrough
1722    case Op_AndReductionV:
1723    case Op_OrReductionV:
1724    case Op_XorReductionV:
1725      if (is_subword_type(bt) && (UseSSE < 4)) {
1726        return false;
1727      }
1728#ifndef _LP64
1729      if (bt == T_BYTE || bt == T_LONG) {
1730        return false;
1731      }
1732#endif
1733      break;
1734#ifndef _LP64
1735    case Op_VectorInsert:
1736      if (bt == T_LONG || bt == T_DOUBLE) {
1737        return false;
1738      }
1739      break;
1740#endif
1741    case Op_MinReductionV:
1742    case Op_MaxReductionV:
1743      if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1744        return false;
1745      } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1746        return false;
1747      }
1748      // Float/Double intrinsics enabled for AVX family.
1749      if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1750        return false;
1751      }
1752      if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1753        return false;
1754      }
1755#ifndef _LP64
1756      if (bt == T_BYTE || bt == T_LONG) {
1757        return false;
1758      }
1759#endif
1760      break;
1761    case Op_VectorTest:
1762      if (UseSSE < 4) {
1763        return false; // Implementation limitation
1764      } else if (size_in_bits < 32) {
1765        return false; // Implementation limitation
1766      } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1767        return false; // Implementation limitation
1768      }
1769      break;
1770    case Op_VectorLoadShuffle:
1771    case Op_VectorRearrange:
1772      if(vlen == 2) {
1773        return false; // Implementation limitation due to how shuffle is loaded
1774      } else if (size_in_bits == 256 && UseAVX < 2) {
1775        return false; // Implementation limitation
1776      } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1777        return false; // Implementation limitation
1778      } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1779        return false; // Implementation limitation
1780      }
1781      break;
1782    case Op_VectorLoadMask:
1783      if (size_in_bits == 256 && UseAVX < 2) {
1784        return false; // Implementation limitation
1785      }
1786      // fallthrough
1787    case Op_VectorStoreMask:
1788      if (vlen == 2) {
1789        return false; // Implementation limitation
1790      }
1791      break;
1792    case Op_VectorCastB2X:
1793      if (size_in_bits == 256 && UseAVX < 2) {
1794        return false; // Implementation limitation
1795      }
1796      break;
1797    case Op_VectorCastS2X:
1798      if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1799        return false;
1800      }
1801      break;
1802    case Op_VectorCastI2X:
1803      if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1804        return false;
1805      }
1806      break;
1807    case Op_VectorCastL2X:
1808      if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1809        return false;
1810      } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1811        return false;
1812      }
1813      break;
1814    case Op_VectorCastF2X:
1815    case Op_VectorCastD2X:
1816      if (is_integral_type(bt)) {
1817        // Casts from FP to integral types require special fixup logic not easily
1818        // implementable with vectors.
1819        return false; // Implementation limitation
1820      }
1821    case Op_MulReductionVI:
1822      if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1823        return false;
1824      }
1825      break;
1826    case Op_StoreVectorScatter:
1827      if(bt == T_BYTE || bt == T_SHORT) {
1828        return false;
1829      } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1830        return false;
1831      }
1832      // fallthrough
1833    case Op_LoadVectorGather:
1834      if (size_in_bits == 64 ) {
1835        return false;
1836      }
1837      break;
1838    case Op_VectorMaskCmp:
1839      if (vlen < 2 || size_in_bits < 32) {
1840        return false;
1841      }
1842      break;
1843  }
1844  return true;  // Per default match rules are supported.
1845}
1846
1847MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1848  assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1849  bool legacy = (generic_opnd->opcode() == LEGVEC);
1850  if (!VM_Version::supports_avx512vlbwdq() && // KNL
1851      is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1852    // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1853    return new legVecZOper();
1854  }
1855  if (legacy) {
1856    switch (ideal_reg) {
1857      case Op_VecS: return new legVecSOper();
1858      case Op_VecD: return new legVecDOper();
1859      case Op_VecX: return new legVecXOper();
1860      case Op_VecY: return new legVecYOper();
1861      case Op_VecZ: return new legVecZOper();
1862    }
1863  } else {
1864    switch (ideal_reg) {
1865      case Op_VecS: return new vecSOper();
1866      case Op_VecD: return new vecDOper();
1867      case Op_VecX: return new vecXOper();
1868      case Op_VecY: return new vecYOper();
1869      case Op_VecZ: return new vecZOper();
1870    }
1871  }
1872  ShouldNotReachHere();
1873  return NULL;
1874}
1875
1876bool Matcher::is_generic_reg2reg_move(MachNode* m) {
1877  switch (m->rule()) {
1878    case MoveVec2Leg_rule:
1879    case MoveLeg2Vec_rule:
1880      return true;
1881    default:
1882      return false;
1883  }
1884}
1885
1886bool Matcher::is_generic_vector(MachOper* opnd) {
1887  switch (opnd->opcode()) {
1888    case VEC:
1889    case LEGVEC:
1890      return true;
1891    default:
1892      return false;
1893  }
1894}
1895
1896//------------------------------------------------------------------------
1897
1898const RegMask* Matcher::predicate_reg_mask(void) {
1899  return &_VECTMASK_REG_mask;
1900}
1901
1902const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
1903  return new TypeVectMask(TypeInt::BOOL, length);
1904}
1905
1906const int Matcher::float_pressure(int default_pressure_threshold) {
1907  int float_pressure_threshold = default_pressure_threshold;
1908#ifdef _LP64
1909  if (UseAVX > 2) {
1910    // Increase pressure threshold on machines with AVX3 which have
1911    // 2x more XMM registers.
1912    float_pressure_threshold = default_pressure_threshold * 2;
1913  }
1914#endif
1915  return float_pressure_threshold;
1916}
1917
1918// Max vector size in bytes. 0 if not supported.
1919const int Matcher::vector_width_in_bytes(BasicType bt) {
1920  assert(is_java_primitive(bt), "only primitive type vectors");
1921  if (UseSSE < 2) return 0;
1922  // SSE2 supports 128bit vectors for all types.
1923  // AVX2 supports 256bit vectors for all types.
1924  // AVX2/EVEX supports 512bit vectors for all types.
1925  int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1926  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1927  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1928    size = (UseAVX > 2) ? 64 : 32;
1929  if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1930    size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1931  // Use flag to limit vector size.
1932  size = MIN2(size,(int)MaxVectorSize);
1933  // Minimum 2 values in vector (or 4 for bytes).
1934  switch (bt) {
1935  case T_DOUBLE:
1936  case T_LONG:
1937    if (size < 16) return 0;
1938    break;
1939  case T_FLOAT:
1940  case T_INT:
1941    if (size < 8) return 0;
1942    break;
1943  case T_BOOLEAN:
1944    if (size < 4) return 0;
1945    break;
1946  case T_CHAR:
1947    if (size < 4) return 0;
1948    break;
1949  case T_BYTE:
1950    if (size < 4) return 0;
1951    break;
1952  case T_SHORT:
1953    if (size < 4) return 0;
1954    break;
1955  default:
1956    ShouldNotReachHere();
1957  }
1958  return size;
1959}
1960
1961// Limits on vector size (number of elements) loaded into vector.
1962const int Matcher::max_vector_size(const BasicType bt) {
1963  return vector_width_in_bytes(bt)/type2aelembytes(bt);
1964}
1965const int Matcher::min_vector_size(const BasicType bt) {
1966  int max_size = max_vector_size(bt);
1967  // Min size which can be loaded into vector is 4 bytes.
1968  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1969  // Support for calling svml double64 vectors
1970  if (bt == T_DOUBLE) {
1971    size = 1;
1972  }
1973  return MIN2(size,max_size);
1974}
1975
1976const int Matcher::scalable_vector_reg_size(const BasicType bt) {
1977  return -1;
1978}
1979
1980// Vector ideal reg corresponding to specified size in bytes
1981const uint Matcher::vector_ideal_reg(int size) {
1982  assert(MaxVectorSize >= size, "");
1983  switch(size) {
1984    case  4: return Op_VecS;
1985    case  8: return Op_VecD;
1986    case 16: return Op_VecX;
1987    case 32: return Op_VecY;
1988    case 64: return Op_VecZ;
1989  }
1990  ShouldNotReachHere();
1991  return 0;
1992}
1993
1994// Check for shift by small constant as well
1995static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1996  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1997      shift->in(2)->get_int() <= 3 &&
1998      // Are there other uses besides address expressions?
1999      !matcher->is_visited(shift)) {
2000    address_visited.set(shift->_idx); // Flag as address_visited
2001    mstack.push(shift->in(2), Matcher::Visit);
2002    Node *conv = shift->in(1);
2003#ifdef _LP64
2004    // Allow Matcher to match the rule which bypass
2005    // ConvI2L operation for an array index on LP64
2006    // if the index value is positive.
2007    if (conv->Opcode() == Op_ConvI2L &&
2008        conv->as_Type()->type()->is_long()->_lo >= 0 &&
2009        // Are there other uses besides address expressions?
2010        !matcher->is_visited(conv)) {
2011      address_visited.set(conv->_idx); // Flag as address_visited
2012      mstack.push(conv->in(1), Matcher::Pre_Visit);
2013    } else
2014#endif
2015      mstack.push(conv, Matcher::Pre_Visit);
2016    return true;
2017  }
2018  return false;
2019}
2020
2021// This function identifies sub-graphs in which a 'load' node is
2022// input to two different nodes, and such that it can be matched
2023// with BMI instructions like blsi, blsr, etc.
2024// Example : for b = -a[i] & a[i] can be matched to blsi r32, m32.
2025// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2026// refers to the same node.
2027//
2028// Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2029// This is a temporary solution until we make DAGs expressible in ADL.
2030template<typename ConType>
2031class FusedPatternMatcher {
2032  Node* _op1_node;
2033  Node* _mop_node;
2034  int _con_op;
2035
2036  static int match_next(Node* n, int next_op, int next_op_idx) {
2037    if (n->in(1) == NULL || n->in(2) == NULL) {
2038      return -1;
2039    }
2040
2041    if (next_op_idx == -1) { // n is commutative, try rotations
2042      if (n->in(1)->Opcode() == next_op) {
2043        return 1;
2044      } else if (n->in(2)->Opcode() == next_op) {
2045        return 2;
2046      }
2047    } else {
2048      assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2049      if (n->in(next_op_idx)->Opcode() == next_op) {
2050        return next_op_idx;
2051      }
2052    }
2053    return -1;
2054  }
2055
2056 public:
2057  FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2058    _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2059
2060  bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2061             int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2062             typename ConType::NativeType con_value) {
2063    if (_op1_node->Opcode() != op1) {
2064      return false;
2065    }
2066    if (_mop_node->outcnt() > 2) {
2067      return false;
2068    }
2069    op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2070    if (op1_op2_idx == -1) {
2071      return false;
2072    }
2073    // Memory operation must be the other edge
2074    int op1_mop_idx = (op1_op2_idx & 1) + 1;
2075
2076    // Check that the mop node is really what we want
2077    if (_op1_node->in(op1_mop_idx) == _mop_node) {
2078      Node* op2_node = _op1_node->in(op1_op2_idx);
2079      if (op2_node->outcnt() > 1) {
2080        return false;
2081      }
2082      assert(op2_node->Opcode() == op2, "Should be");
2083      op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2084      if (op2_con_idx == -1) {
2085        return false;
2086      }
2087      // Memory operation must be the other edge
2088      int op2_mop_idx = (op2_con_idx & 1) + 1;
2089      // Check that the memory operation is the same node
2090      if (op2_node->in(op2_mop_idx) == _mop_node) {
2091        // Now check the constant
2092        const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2093        if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2094          return true;
2095        }
2096      }
2097    }
2098    return false;
2099  }
2100};
2101
2102static bool is_bmi_pattern(Node* n, Node* m) {
2103  assert(UseBMI1Instructions, "sanity");
2104  if (n != NULL && m != NULL) {
2105    if (m->Opcode() == Op_LoadI) {
2106      FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2107      return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2108             bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2109             bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2110    } else if (m->Opcode() == Op_LoadL) {
2111      FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2112      return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2113             bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2114             bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2115    }
2116  }
2117  return false;
2118}
2119
2120// Should the matcher clone input 'm' of node 'n'?
2121bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2122  // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2123  if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2124    mstack.push(m, Visit);
2125    return true;
2126  }
2127  if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2128    mstack.push(m, Visit);           // m = ShiftCntV
2129    return true;
2130  }
2131  return false;
2132}
2133
2134// Should the Matcher clone shifts on addressing modes, expecting them
2135// to be subsumed into complex addressing expressions or compute them
2136// into registers?
2137bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2138  Node *off = m->in(AddPNode::Offset);
2139  if (off->is_Con()) {
2140    address_visited.test_set(m->_idx); // Flag as address_visited
2141    Node *adr = m->in(AddPNode::Address);
2142
2143    // Intel can handle 2 adds in addressing mode
2144    // AtomicAdd is not an addressing expression.
2145    // Cheap to find it by looking for screwy base.
2146    if (adr->is_AddP() &&
2147        !adr->in(AddPNode::Base)->is_top() &&
2148        LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2149        // Are there other uses besides address expressions?
2150        !is_visited(adr)) {
2151      address_visited.set(adr->_idx); // Flag as address_visited
2152      Node *shift = adr->in(AddPNode::Offset);
2153      if (!clone_shift(shift, this, mstack, address_visited)) {
2154        mstack.push(shift, Pre_Visit);
2155      }
2156      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2157      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2158    } else {
2159      mstack.push(adr, Pre_Visit);
2160    }
2161
2162    // Clone X+offset as it also folds into most addressing expressions
2163    mstack.push(off, Visit);
2164    mstack.push(m->in(AddPNode::Base), Pre_Visit);
2165    return true;
2166  } else if (clone_shift(off, this, mstack, address_visited)) {
2167    address_visited.test_set(m->_idx); // Flag as address_visited
2168    mstack.push(m->in(AddPNode::Address), Pre_Visit);
2169    mstack.push(m->in(AddPNode::Base), Pre_Visit);
2170    return true;
2171  }
2172  return false;
2173}
2174
2175static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2176  switch (bt) {
2177    case BoolTest::eq:
2178      return Assembler::eq;
2179    case BoolTest::ne:
2180      return Assembler::neq;
2181    case BoolTest::le:
2182    case BoolTest::ule:
2183      return Assembler::le;
2184    case BoolTest::ge:
2185    case BoolTest::uge:
2186      return Assembler::nlt;
2187    case BoolTest::lt:
2188    case BoolTest::ult:
2189      return Assembler::lt;
2190    case BoolTest::gt:
2191    case BoolTest::ugt:
2192      return Assembler::nle;
2193    default : ShouldNotReachHere(); return Assembler::_false;
2194  }
2195}
2196
2197static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2198  switch (bt) {
2199  case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2200  // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2201  case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2202  case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2203  case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2204  case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2205  case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2206  default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2207  }
2208}
2209
2210// Helper methods for MachSpillCopyNode::implementation().
2211static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2212                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
2213  assert(ireg == Op_VecS || // 32bit vector
2214         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2215         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2216         "no non-adjacent vector moves" );
2217  if (cbuf) {
2218    C2_MacroAssembler _masm(cbuf);
2219    switch (ireg) {
2220    case Op_VecS: // copy whole register
2221    case Op_VecD:
2222    case Op_VecX:
2223#ifndef _LP64
2224      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2225#else
2226      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2227        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2228      } else {
2229        __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2230     }
2231#endif
2232      break;
2233    case Op_VecY:
2234#ifndef _LP64
2235      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2236#else
2237      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2238        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2239      } else {
2240        __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2241     }
2242#endif
2243      break;
2244    case Op_VecZ:
2245      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2246      break;
2247    default:
2248      ShouldNotReachHere();
2249    }
2250#ifndef PRODUCT
2251  } else {
2252    switch (ireg) {
2253    case Op_VecS:
2254    case Op_VecD:
2255    case Op_VecX:
2256      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2257      break;
2258    case Op_VecY:
2259    case Op_VecZ:
2260      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2261      break;
2262    default:
2263      ShouldNotReachHere();
2264    }
2265#endif
2266  }
2267}
2268
2269void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2270                     int stack_offset, int reg, uint ireg, outputStream* st) {
2271  if (cbuf) {
2272    C2_MacroAssembler _masm(cbuf);
2273    if (is_load) {
2274      switch (ireg) {
2275      case Op_VecS:
2276        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2277        break;
2278      case Op_VecD:
2279        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2280        break;
2281      case Op_VecX:
2282#ifndef _LP64
2283        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2284#else
2285        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2286          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2287        } else {
2288          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2289          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2290        }
2291#endif
2292        break;
2293      case Op_VecY:
2294#ifndef _LP64
2295        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2296#else
2297        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2298          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2299        } else {
2300          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2301          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2302        }
2303#endif
2304        break;
2305      case Op_VecZ:
2306        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2307        break;
2308      default:
2309        ShouldNotReachHere();
2310      }
2311    } else { // store
2312      switch (ireg) {
2313      case Op_VecS:
2314        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2315        break;
2316      case Op_VecD:
2317        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2318        break;
2319      case Op_VecX:
2320#ifndef _LP64
2321        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2322#else
2323        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2324          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2325        }
2326        else {
2327          __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2328        }
2329#endif
2330        break;
2331      case Op_VecY:
2332#ifndef _LP64
2333        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2334#else
2335        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2336          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2337        }
2338        else {
2339          __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2340        }
2341#endif
2342        break;
2343      case Op_VecZ:
2344        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2345        break;
2346      default:
2347        ShouldNotReachHere();
2348      }
2349    }
2350#ifndef PRODUCT
2351  } else {
2352    if (is_load) {
2353      switch (ireg) {
2354      case Op_VecS:
2355        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2356        break;
2357      case Op_VecD:
2358        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2359        break;
2360       case Op_VecX:
2361        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2362        break;
2363      case Op_VecY:
2364      case Op_VecZ:
2365        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2366        break;
2367      default:
2368        ShouldNotReachHere();
2369      }
2370    } else { // store
2371      switch (ireg) {
2372      case Op_VecS:
2373        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2374        break;
2375      case Op_VecD:
2376        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2377        break;
2378       case Op_VecX:
2379        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2380        break;
2381      case Op_VecY:
2382      case Op_VecZ:
2383        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2384        break;
2385      default:
2386        ShouldNotReachHere();
2387      }
2388    }
2389#endif
2390  }
2391}
2392
2393static inline jlong replicate8_imm(int con, int width) {
2394  // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
2395  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2396  int bit_width = width * 8;
2397  jlong val = con;
2398  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2399  while(bit_width < 64) {
2400    val |= (val << bit_width);
2401    bit_width <<= 1;
2402  }
2403  return val;
2404}
2405
2406#ifndef PRODUCT
2407  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2408    st->print("nop \t# %d bytes pad for loops and calls", _count);
2409  }
2410#endif
2411
2412  void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2413    C2_MacroAssembler _masm(&cbuf);
2414    __ nop(_count);
2415  }
2416
2417  uint MachNopNode::size(PhaseRegAlloc*) const {
2418    return _count;
2419  }
2420
2421#ifndef PRODUCT
2422  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2423    st->print("# breakpoint");
2424  }
2425#endif
2426
2427  void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2428    C2_MacroAssembler _masm(&cbuf);
2429    __ int3();
2430  }
2431
2432  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2433    return MachNode::size(ra_);
2434  }
2435
2436%}
2437
2438encode %{
2439
2440  enc_class call_epilog %{
2441    if (VerifyStackAtCalls) {
2442      // Check that stack depth is unchanged: find majik cookie on stack
2443      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2444      C2_MacroAssembler _masm(&cbuf);
2445      Label L;
2446      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2447      __ jccb(Assembler::equal, L);
2448      // Die if stack mismatch
2449      __ int3();
2450      __ bind(L);
2451    }
2452  %}
2453
2454%}
2455
2456// Operands for bound floating pointer register arguments
2457operand rxmm0() %{
2458  constraint(ALLOC_IN_RC(xmm0_reg));
2459  match(VecX);
2460  format%{%}
2461  interface(REG_INTER);
2462%}
2463
2464//----------OPERANDS-----------------------------------------------------------
2465// Operand definitions must precede instruction definitions for correct parsing
2466// in the ADLC because operands constitute user defined types which are used in
2467// instruction definitions.
2468
2469// Vectors
2470
2471// Dummy generic vector class. Should be used for all vector operands.
2472// Replaced with vec[SDXYZ] during post-selection pass.
2473operand vec() %{
2474  constraint(ALLOC_IN_RC(dynamic));
2475  match(VecX);
2476  match(VecY);
2477  match(VecZ);
2478  match(VecS);
2479  match(VecD);
2480
2481  format %{ %}
2482  interface(REG_INTER);
2483%}
2484
2485// Dummy generic legacy vector class. Should be used for all legacy vector operands.
2486// Replaced with legVec[SDXYZ] during post-selection cleanup.
2487// Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2488// runtime code generation via reg_class_dynamic.
2489operand legVec() %{
2490  constraint(ALLOC_IN_RC(dynamic));
2491  match(VecX);
2492  match(VecY);
2493  match(VecZ);
2494  match(VecS);
2495  match(VecD);
2496
2497  format %{ %}
2498  interface(REG_INTER);
2499%}
2500
2501// Replaces vec during post-selection cleanup. See above.
2502operand vecS() %{
2503  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2504  match(VecS);
2505
2506  format %{ %}
2507  interface(REG_INTER);
2508%}
2509
2510// Replaces legVec during post-selection cleanup. See above.
2511operand legVecS() %{
2512  constraint(ALLOC_IN_RC(vectors_reg_legacy));
2513  match(VecS);
2514
2515  format %{ %}
2516  interface(REG_INTER);
2517%}
2518
2519// Replaces vec during post-selection cleanup. See above.
2520operand vecD() %{
2521  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2522  match(VecD);
2523
2524  format %{ %}
2525  interface(REG_INTER);
2526%}
2527
2528// Replaces legVec during post-selection cleanup. See above.
2529operand legVecD() %{
2530  constraint(ALLOC_IN_RC(vectord_reg_legacy));
2531  match(VecD);
2532
2533  format %{ %}
2534  interface(REG_INTER);
2535%}
2536
2537// Replaces vec during post-selection cleanup. See above.
2538operand vecX() %{
2539  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2540  match(VecX);
2541
2542  format %{ %}
2543  interface(REG_INTER);
2544%}
2545
2546// Replaces legVec during post-selection cleanup. See above.
2547operand legVecX() %{
2548  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2549  match(VecX);
2550
2551  format %{ %}
2552  interface(REG_INTER);
2553%}
2554
2555// Replaces vec during post-selection cleanup. See above.
2556operand vecY() %{
2557  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2558  match(VecY);
2559
2560  format %{ %}
2561  interface(REG_INTER);
2562%}
2563
2564// Replaces legVec during post-selection cleanup. See above.
2565operand legVecY() %{
2566  constraint(ALLOC_IN_RC(vectory_reg_legacy));
2567  match(VecY);
2568
2569  format %{ %}
2570  interface(REG_INTER);
2571%}
2572
2573// Replaces vec during post-selection cleanup. See above.
2574operand vecZ() %{
2575  constraint(ALLOC_IN_RC(vectorz_reg));
2576  match(VecZ);
2577
2578  format %{ %}
2579  interface(REG_INTER);
2580%}
2581
2582// Replaces legVec during post-selection cleanup. See above.
2583operand legVecZ() %{
2584  constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2585  match(VecZ);
2586
2587  format %{ %}
2588  interface(REG_INTER);
2589%}
2590
2591// Comparison Code for FP conditional move
2592operand cmpOp_vcmppd() %{
2593  match(Bool);
2594
2595  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2596            n->as_Bool()->_test._test != BoolTest::no_overflow);
2597  format %{ "" %}
2598  interface(COND_INTER) %{
2599    equal        (0x0, "eq");
2600    less         (0x1, "lt");
2601    less_equal   (0x2, "le");
2602    not_equal    (0xC, "ne");
2603    greater_equal(0xD, "ge");
2604    greater      (0xE, "gt");
2605    //TODO cannot compile (adlc breaks) without two next lines with error:
2606    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2607    // equal' for overflow.
2608    overflow     (0x20, "o");  // not really supported by the instruction
2609    no_overflow  (0x21, "no"); // not really supported by the instruction
2610  %}
2611%}
2612
2613
2614// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2615
2616// ============================================================================
2617
2618instruct ShouldNotReachHere() %{
2619  match(Halt);
2620  format %{ "stop\t# ShouldNotReachHere" %}
2621  ins_encode %{
2622    if (is_reachable()) {
2623      __ stop(_halt_reason);
2624    }
2625  %}
2626  ins_pipe(pipe_slow);
2627%}
2628
2629// =================================EVEX special===============================
2630// Existing partial implementation for post-loop multi-versioning computes
2631// the mask corresponding to tail loop in K1 opmask register. This may then be
2632// used for predicating instructions in loop body during last post-loop iteration.
2633// TODO: Remove hard-coded K1 usage while fixing existing post-loop
2634// multiversioning support.
2635instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2636  predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2637  match(Set dst (SetVectMaskI  src));
2638  effect(TEMP dst);
2639  format %{ "setvectmask   $dst, $src" %}
2640  ins_encode %{
2641    __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2642  %}
2643  ins_pipe(pipe_slow);
2644%}
2645
2646// ============================================================================
2647
2648instruct addF_reg(regF dst, regF src) %{
2649  predicate((UseSSE>=1) && (UseAVX == 0));
2650  match(Set dst (AddF dst src));
2651
2652  format %{ "addss   $dst, $src" %}
2653  ins_cost(150);
2654  ins_encode %{
2655    __ addss($dst$$XMMRegister, $src$$XMMRegister);
2656  %}
2657  ins_pipe(pipe_slow);
2658%}
2659
2660instruct addF_mem(regF dst, memory src) %{
2661  predicate((UseSSE>=1) && (UseAVX == 0));
2662  match(Set dst (AddF dst (LoadF src)));
2663
2664  format %{ "addss   $dst, $src" %}
2665  ins_cost(150);
2666  ins_encode %{
2667    __ addss($dst$$XMMRegister, $src$$Address);
2668  %}
2669  ins_pipe(pipe_slow);
2670%}
2671
2672instruct addF_imm(regF dst, immF con) %{
2673  predicate((UseSSE>=1) && (UseAVX == 0));
2674  match(Set dst (AddF dst con));
2675  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2676  ins_cost(150);
2677  ins_encode %{
2678    __ addss($dst$$XMMRegister, $constantaddress($con));
2679  %}
2680  ins_pipe(pipe_slow);
2681%}
2682
2683instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2684  predicate(UseAVX > 0);
2685  match(Set dst (AddF src1 src2));
2686
2687  format %{ "vaddss  $dst, $src1, $src2" %}
2688  ins_cost(150);
2689  ins_encode %{
2690    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2691  %}
2692  ins_pipe(pipe_slow);
2693%}
2694
2695instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2696  predicate(UseAVX > 0);
2697  match(Set dst (AddF src1 (LoadF src2)));
2698
2699  format %{ "vaddss  $dst, $src1, $src2" %}
2700  ins_cost(150);
2701  ins_encode %{
2702    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2703  %}
2704  ins_pipe(pipe_slow);
2705%}
2706
2707instruct addF_reg_imm(regF dst, regF src, immF con) %{
2708  predicate(UseAVX > 0);
2709  match(Set dst (AddF src con));
2710
2711  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2712  ins_cost(150);
2713  ins_encode %{
2714    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2715  %}
2716  ins_pipe(pipe_slow);
2717%}
2718
2719instruct addD_reg(regD dst, regD src) %{
2720  predicate((UseSSE>=2) && (UseAVX == 0));
2721  match(Set dst (AddD dst src));
2722
2723  format %{ "addsd   $dst, $src" %}
2724  ins_cost(150);
2725  ins_encode %{
2726    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2727  %}
2728  ins_pipe(pipe_slow);
2729%}
2730
2731instruct addD_mem(regD dst, memory src) %{
2732  predicate((UseSSE>=2) && (UseAVX == 0));
2733  match(Set dst (AddD dst (LoadD src)));
2734
2735  format %{ "addsd   $dst, $src" %}
2736  ins_cost(150);
2737  ins_encode %{
2738    __ addsd($dst$$XMMRegister, $src$$Address);
2739  %}
2740  ins_pipe(pipe_slow);
2741%}
2742
2743instruct addD_imm(regD dst, immD con) %{
2744  predicate((UseSSE>=2) && (UseAVX == 0));
2745  match(Set dst (AddD dst con));
2746  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2747  ins_cost(150);
2748  ins_encode %{
2749    __ addsd($dst$$XMMRegister, $constantaddress($con));
2750  %}
2751  ins_pipe(pipe_slow);
2752%}
2753
2754instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2755  predicate(UseAVX > 0);
2756  match(Set dst (AddD src1 src2));
2757
2758  format %{ "vaddsd  $dst, $src1, $src2" %}
2759  ins_cost(150);
2760  ins_encode %{
2761    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2762  %}
2763  ins_pipe(pipe_slow);
2764%}
2765
2766instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2767  predicate(UseAVX > 0);
2768  match(Set dst (AddD src1 (LoadD src2)));
2769
2770  format %{ "vaddsd  $dst, $src1, $src2" %}
2771  ins_cost(150);
2772  ins_encode %{
2773    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2774  %}
2775  ins_pipe(pipe_slow);
2776%}
2777
2778instruct addD_reg_imm(regD dst, regD src, immD con) %{
2779  predicate(UseAVX > 0);
2780  match(Set dst (AddD src con));
2781
2782  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2783  ins_cost(150);
2784  ins_encode %{
2785    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2786  %}
2787  ins_pipe(pipe_slow);
2788%}
2789
2790instruct subF_reg(regF dst, regF src) %{
2791  predicate((UseSSE>=1) && (UseAVX == 0));
2792  match(Set dst (SubF dst src));
2793
2794  format %{ "subss   $dst, $src" %}
2795  ins_cost(150);
2796  ins_encode %{
2797    __ subss($dst$$XMMRegister, $src$$XMMRegister);
2798  %}
2799  ins_pipe(pipe_slow);
2800%}
2801
2802instruct subF_mem(regF dst, memory src) %{
2803  predicate((UseSSE>=1) && (UseAVX == 0));
2804  match(Set dst (SubF dst (LoadF src)));
2805
2806  format %{ "subss   $dst, $src" %}
2807  ins_cost(150);
2808  ins_encode %{
2809    __ subss($dst$$XMMRegister, $src$$Address);
2810  %}
2811  ins_pipe(pipe_slow);
2812%}
2813
2814instruct subF_imm(regF dst, immF con) %{
2815  predicate((UseSSE>=1) && (UseAVX == 0));
2816  match(Set dst (SubF dst con));
2817  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2818  ins_cost(150);
2819  ins_encode %{
2820    __ subss($dst$$XMMRegister, $constantaddress($con));
2821  %}
2822  ins_pipe(pipe_slow);
2823%}
2824
2825instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2826  predicate(UseAVX > 0);
2827  match(Set dst (SubF src1 src2));
2828
2829  format %{ "vsubss  $dst, $src1, $src2" %}
2830  ins_cost(150);
2831  ins_encode %{
2832    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2833  %}
2834  ins_pipe(pipe_slow);
2835%}
2836
2837instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2838  predicate(UseAVX > 0);
2839  match(Set dst (SubF src1 (LoadF src2)));
2840
2841  format %{ "vsubss  $dst, $src1, $src2" %}
2842  ins_cost(150);
2843  ins_encode %{
2844    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2845  %}
2846  ins_pipe(pipe_slow);
2847%}
2848
2849instruct subF_reg_imm(regF dst, regF src, immF con) %{
2850  predicate(UseAVX > 0);
2851  match(Set dst (SubF src con));
2852
2853  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2854  ins_cost(150);
2855  ins_encode %{
2856    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2857  %}
2858  ins_pipe(pipe_slow);
2859%}
2860
2861instruct subD_reg(regD dst, regD src) %{
2862  predicate((UseSSE>=2) && (UseAVX == 0));
2863  match(Set dst (SubD dst src));
2864
2865  format %{ "subsd   $dst, $src" %}
2866  ins_cost(150);
2867  ins_encode %{
2868    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2869  %}
2870  ins_pipe(pipe_slow);
2871%}
2872
2873instruct subD_mem(regD dst, memory src) %{
2874  predicate((UseSSE>=2) && (UseAVX == 0));
2875  match(Set dst (SubD dst (LoadD src)));
2876
2877  format %{ "subsd   $dst, $src" %}
2878  ins_cost(150);
2879  ins_encode %{
2880    __ subsd($dst$$XMMRegister, $src$$Address);
2881  %}
2882  ins_pipe(pipe_slow);
2883%}
2884
2885instruct subD_imm(regD dst, immD con) %{
2886  predicate((UseSSE>=2) && (UseAVX == 0));
2887  match(Set dst (SubD dst con));
2888  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2889  ins_cost(150);
2890  ins_encode %{
2891    __ subsd($dst$$XMMRegister, $constantaddress($con));
2892  %}
2893  ins_pipe(pipe_slow);
2894%}
2895
2896instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2897  predicate(UseAVX > 0);
2898  match(Set dst (SubD src1 src2));
2899
2900  format %{ "vsubsd  $dst, $src1, $src2" %}
2901  ins_cost(150);
2902  ins_encode %{
2903    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2904  %}
2905  ins_pipe(pipe_slow);
2906%}
2907
2908instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2909  predicate(UseAVX > 0);
2910  match(Set dst (SubD src1 (LoadD src2)));
2911
2912  format %{ "vsubsd  $dst, $src1, $src2" %}
2913  ins_cost(150);
2914  ins_encode %{
2915    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2916  %}
2917  ins_pipe(pipe_slow);
2918%}
2919
2920instruct subD_reg_imm(regD dst, regD src, immD con) %{
2921  predicate(UseAVX > 0);
2922  match(Set dst (SubD src con));
2923
2924  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2925  ins_cost(150);
2926  ins_encode %{
2927    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2928  %}
2929  ins_pipe(pipe_slow);
2930%}
2931
2932instruct mulF_reg(regF dst, regF src) %{
2933  predicate((UseSSE>=1) && (UseAVX == 0));
2934  match(Set dst (MulF dst src));
2935
2936  format %{ "mulss   $dst, $src" %}
2937  ins_cost(150);
2938  ins_encode %{
2939    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2940  %}
2941  ins_pipe(pipe_slow);
2942%}
2943
2944instruct mulF_mem(regF dst, memory src) %{
2945  predicate((UseSSE>=1) && (UseAVX == 0));
2946  match(Set dst (MulF dst (LoadF src)));
2947
2948  format %{ "mulss   $dst, $src" %}
2949  ins_cost(150);
2950  ins_encode %{
2951    __ mulss($dst$$XMMRegister, $src$$Address);
2952  %}
2953  ins_pipe(pipe_slow);
2954%}
2955
2956instruct mulF_imm(regF dst, immF con) %{
2957  predicate((UseSSE>=1) && (UseAVX == 0));
2958  match(Set dst (MulF dst con));
2959  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2960  ins_cost(150);
2961  ins_encode %{
2962    __ mulss($dst$$XMMRegister, $constantaddress($con));
2963  %}
2964  ins_pipe(pipe_slow);
2965%}
2966
2967instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2968  predicate(UseAVX > 0);
2969  match(Set dst (MulF src1 src2));
2970
2971  format %{ "vmulss  $dst, $src1, $src2" %}
2972  ins_cost(150);
2973  ins_encode %{
2974    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2975  %}
2976  ins_pipe(pipe_slow);
2977%}
2978
2979instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2980  predicate(UseAVX > 0);
2981  match(Set dst (MulF src1 (LoadF src2)));
2982
2983  format %{ "vmulss  $dst, $src1, $src2" %}
2984  ins_cost(150);
2985  ins_encode %{
2986    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2987  %}
2988  ins_pipe(pipe_slow);
2989%}
2990
2991instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2992  predicate(UseAVX > 0);
2993  match(Set dst (MulF src con));
2994
2995  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2996  ins_cost(150);
2997  ins_encode %{
2998    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2999  %}
3000  ins_pipe(pipe_slow);
3001%}
3002
3003instruct mulD_reg(regD dst, regD src) %{
3004  predicate((UseSSE>=2) && (UseAVX == 0));
3005  match(Set dst (MulD dst src));
3006
3007  format %{ "mulsd   $dst, $src" %}
3008  ins_cost(150);
3009  ins_encode %{
3010    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3011  %}
3012  ins_pipe(pipe_slow);
3013%}
3014
3015instruct mulD_mem(regD dst, memory src) %{
3016  predicate((UseSSE>=2) && (UseAVX == 0));
3017  match(Set dst (MulD dst (LoadD src)));
3018
3019  format %{ "mulsd   $dst, $src" %}
3020  ins_cost(150);
3021  ins_encode %{
3022    __ mulsd($dst$$XMMRegister, $src$$Address);
3023  %}
3024  ins_pipe(pipe_slow);
3025%}
3026
3027instruct mulD_imm(regD dst, immD con) %{
3028  predicate((UseSSE>=2) && (UseAVX == 0));
3029  match(Set dst (MulD dst con));
3030  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3031  ins_cost(150);
3032  ins_encode %{
3033    __ mulsd($dst$$XMMRegister, $constantaddress($con));
3034  %}
3035  ins_pipe(pipe_slow);
3036%}
3037
3038instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3039  predicate(UseAVX > 0);
3040  match(Set dst (MulD src1 src2));
3041
3042  format %{ "vmulsd  $dst, $src1, $src2" %}
3043  ins_cost(150);
3044  ins_encode %{
3045    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3046  %}
3047  ins_pipe(pipe_slow);
3048%}
3049
3050instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3051  predicate(UseAVX > 0);
3052  match(Set dst (MulD src1 (LoadD src2)));
3053
3054  format %{ "vmulsd  $dst, $src1, $src2" %}
3055  ins_cost(150);
3056  ins_encode %{
3057    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3058  %}
3059  ins_pipe(pipe_slow);
3060%}
3061
3062instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3063  predicate(UseAVX > 0);
3064  match(Set dst (MulD src con));
3065
3066  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3067  ins_cost(150);
3068  ins_encode %{
3069    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3070  %}
3071  ins_pipe(pipe_slow);
3072%}
3073
3074instruct divF_reg(regF dst, regF src) %{
3075  predicate((UseSSE>=1) && (UseAVX == 0));
3076  match(Set dst (DivF dst src));
3077
3078  format %{ "divss   $dst, $src" %}
3079  ins_cost(150);
3080  ins_encode %{
3081    __ divss($dst$$XMMRegister, $src$$XMMRegister);
3082  %}
3083  ins_pipe(pipe_slow);
3084%}
3085
3086instruct divF_mem(regF dst, memory src) %{
3087  predicate((UseSSE>=1) && (UseAVX == 0));
3088  match(Set dst (DivF dst (LoadF src)));
3089
3090  format %{ "divss   $dst, $src" %}
3091  ins_cost(150);
3092  ins_encode %{
3093    __ divss($dst$$XMMRegister, $src$$Address);
3094  %}
3095  ins_pipe(pipe_slow);
3096%}
3097
3098instruct divF_imm(regF dst, immF con) %{
3099  predicate((UseSSE>=1) && (UseAVX == 0));
3100  match(Set dst (DivF dst con));
3101  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3102  ins_cost(150);
3103  ins_encode %{
3104    __ divss($dst$$XMMRegister, $constantaddress($con));
3105  %}
3106  ins_pipe(pipe_slow);
3107%}
3108
3109instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3110  predicate(UseAVX > 0);
3111  match(Set dst (DivF src1 src2));
3112
3113  format %{ "vdivss  $dst, $src1, $src2" %}
3114  ins_cost(150);
3115  ins_encode %{
3116    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3117  %}
3118  ins_pipe(pipe_slow);
3119%}
3120
3121instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3122  predicate(UseAVX > 0);
3123  match(Set dst (DivF src1 (LoadF src2)));
3124
3125  format %{ "vdivss  $dst, $src1, $src2" %}
3126  ins_cost(150);
3127  ins_encode %{
3128    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3129  %}
3130  ins_pipe(pipe_slow);
3131%}
3132
3133instruct divF_reg_imm(regF dst, regF src, immF con) %{
3134  predicate(UseAVX > 0);
3135  match(Set dst (DivF src con));
3136
3137  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3138  ins_cost(150);
3139  ins_encode %{
3140    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3141  %}
3142  ins_pipe(pipe_slow);
3143%}
3144
3145instruct divD_reg(regD dst, regD src) %{
3146  predicate((UseSSE>=2) && (UseAVX == 0));
3147  match(Set dst (DivD dst src));
3148
3149  format %{ "divsd   $dst, $src" %}
3150  ins_cost(150);
3151  ins_encode %{
3152    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3153  %}
3154  ins_pipe(pipe_slow);
3155%}
3156
3157instruct divD_mem(regD dst, memory src) %{
3158  predicate((UseSSE>=2) && (UseAVX == 0));
3159  match(Set dst (DivD dst (LoadD src)));
3160
3161  format %{ "divsd   $dst, $src" %}
3162  ins_cost(150);
3163  ins_encode %{
3164    __ divsd($dst$$XMMRegister, $src$$Address);
3165  %}
3166  ins_pipe(pipe_slow);
3167%}
3168
3169instruct divD_imm(regD dst, immD con) %{
3170  predicate((UseSSE>=2) && (UseAVX == 0));
3171  match(Set dst (DivD dst con));
3172  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3173  ins_cost(150);
3174  ins_encode %{
3175    __ divsd($dst$$XMMRegister, $constantaddress($con));
3176  %}
3177  ins_pipe(pipe_slow);
3178%}
3179
3180instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3181  predicate(UseAVX > 0);
3182  match(Set dst (DivD src1 src2));
3183
3184  format %{ "vdivsd  $dst, $src1, $src2" %}
3185  ins_cost(150);
3186  ins_encode %{
3187    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3188  %}
3189  ins_pipe(pipe_slow);
3190%}
3191
3192instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3193  predicate(UseAVX > 0);
3194  match(Set dst (DivD src1 (LoadD src2)));
3195
3196  format %{ "vdivsd  $dst, $src1, $src2" %}
3197  ins_cost(150);
3198  ins_encode %{
3199    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3200  %}
3201  ins_pipe(pipe_slow);
3202%}
3203
3204instruct divD_reg_imm(regD dst, regD src, immD con) %{
3205  predicate(UseAVX > 0);
3206  match(Set dst (DivD src con));
3207
3208  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3209  ins_cost(150);
3210  ins_encode %{
3211    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3212  %}
3213  ins_pipe(pipe_slow);
3214%}
3215
3216instruct absF_reg(regF dst) %{
3217  predicate((UseSSE>=1) && (UseAVX == 0));
3218  match(Set dst (AbsF dst));
3219  ins_cost(150);
3220  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3221  ins_encode %{
3222    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3223  %}
3224  ins_pipe(pipe_slow);
3225%}
3226
3227instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3228  predicate(UseAVX > 0);
3229  match(Set dst (AbsF src));
3230  ins_cost(150);
3231  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3232  ins_encode %{
3233    int vlen_enc = Assembler::AVX_128bit;
3234    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3235              ExternalAddress(float_signmask()), vlen_enc);
3236  %}
3237  ins_pipe(pipe_slow);
3238%}
3239
3240instruct absD_reg(regD dst) %{
3241  predicate((UseSSE>=2) && (UseAVX == 0));
3242  match(Set dst (AbsD dst));
3243  ins_cost(150);
3244  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3245            "# abs double by sign masking" %}
3246  ins_encode %{
3247    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3248  %}
3249  ins_pipe(pipe_slow);
3250%}
3251
3252instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3253  predicate(UseAVX > 0);
3254  match(Set dst (AbsD src));
3255  ins_cost(150);
3256  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3257            "# abs double by sign masking" %}
3258  ins_encode %{
3259    int vlen_enc = Assembler::AVX_128bit;
3260    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3261              ExternalAddress(double_signmask()), vlen_enc);
3262  %}
3263  ins_pipe(pipe_slow);
3264%}
3265
3266instruct negF_reg(regF dst) %{
3267  predicate((UseSSE>=1) && (UseAVX == 0));
3268  match(Set dst (NegF dst));
3269  ins_cost(150);
3270  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3271  ins_encode %{
3272    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3273  %}
3274  ins_pipe(pipe_slow);
3275%}
3276
3277instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3278  predicate(UseAVX > 0);
3279  match(Set dst (NegF src));
3280  ins_cost(150);
3281  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3282  ins_encode %{
3283    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3284                 ExternalAddress(float_signflip()));
3285  %}
3286  ins_pipe(pipe_slow);
3287%}
3288
3289instruct negD_reg(regD dst) %{
3290  predicate((UseSSE>=2) && (UseAVX == 0));
3291  match(Set dst (NegD dst));
3292  ins_cost(150);
3293  format %{ "xorpd   $dst, [0x8000000000000000]\t"
3294            "# neg double by sign flipping" %}
3295  ins_encode %{
3296    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3297  %}
3298  ins_pipe(pipe_slow);
3299%}
3300
3301instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3302  predicate(UseAVX > 0);
3303  match(Set dst (NegD src));
3304  ins_cost(150);
3305  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3306            "# neg double by sign flipping" %}
3307  ins_encode %{
3308    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3309                 ExternalAddress(double_signflip()));
3310  %}
3311  ins_pipe(pipe_slow);
3312%}
3313
3314// sqrtss instruction needs destination register to be pre initialized for best performance
3315// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3316instruct sqrtF_reg(regF dst) %{
3317  predicate(UseSSE>=1);
3318  match(Set dst (SqrtF dst));
3319  format %{ "sqrtss  $dst, $dst" %}
3320  ins_encode %{
3321    __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3322  %}
3323  ins_pipe(pipe_slow);
3324%}
3325
3326// sqrtsd instruction needs destination register to be pre initialized for best performance
3327// Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3328instruct sqrtD_reg(regD dst) %{
3329  predicate(UseSSE>=2);
3330  match(Set dst (SqrtD dst));
3331  format %{ "sqrtsd  $dst, $dst" %}
3332  ins_encode %{
3333    __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3334  %}
3335  ins_pipe(pipe_slow);
3336%}
3337
3338// ---------------------------------------- VectorReinterpret ------------------------------------
3339
3340instruct reinterpret(vec dst) %{
3341  predicate(vector_length_in_bytes(n) == vector_length_in_bytes(n->in(1))); // dst == src
3342  match(Set dst (VectorReinterpret dst));
3343  ins_cost(125);
3344  format %{ "vector_reinterpret $dst\t!" %}
3345  ins_encode %{
3346    // empty
3347  %}
3348  ins_pipe( pipe_slow );
3349%}
3350
3351instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3352  predicate(UseAVX == 0 &&
3353            (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3354  match(Set dst (VectorReinterpret src));
3355  ins_cost(125);
3356  effect(TEMP dst, TEMP scratch);
3357  format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3358  ins_encode %{
3359    assert(vector_length_in_bytes(this)       <= 16, "required");
3360    assert(vector_length_in_bytes(this, $src) <=  8, "required");
3361
3362    int src_vlen_in_bytes = vector_length_in_bytes(this, $src);
3363    if (src_vlen_in_bytes == 4) {
3364      __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3365    } else {
3366      assert(src_vlen_in_bytes == 8, "");
3367      __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3368    }
3369    __ pand($dst$$XMMRegister, $src$$XMMRegister);
3370  %}
3371  ins_pipe( pipe_slow );
3372%}
3373
3374instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3375  predicate(UseAVX > 0 &&
3376            (vector_length_in_bytes(n->in(1)) == 4) && // src
3377            (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3378  match(Set dst (VectorReinterpret src));
3379  ins_cost(125);
3380  effect(TEMP scratch);
3381  format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3382  ins_encode %{
3383    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3384  %}
3385  ins_pipe( pipe_slow );
3386%}
3387
3388
3389instruct vreinterpret_expand(legVec dst, vec src) %{
3390  predicate(UseAVX > 0 &&
3391            (vector_length_in_bytes(n->in(1)) > 4) && // src
3392            (vector_length_in_bytes(n->in(1)) < vector_length_in_bytes(n))); // src < dst
3393  match(Set dst (VectorReinterpret src));
3394  ins_cost(125);
3395  format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3396  ins_encode %{
3397    switch (vector_length_in_bytes(this, $src)) {
3398      case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3399      case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3400      case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3401      default: ShouldNotReachHere();
3402    }
3403  %}
3404  ins_pipe( pipe_slow );
3405%}
3406
3407instruct reinterpret_shrink(vec dst, legVec src) %{
3408  predicate(vector_length_in_bytes(n->in(1)) > vector_length_in_bytes(n)); // src > dst
3409  match(Set dst (VectorReinterpret src));
3410  ins_cost(125);
3411  format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3412  ins_encode %{
3413    switch (vector_length_in_bytes(this)) {
3414      case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3415      case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3416      case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3417      case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3418      default: ShouldNotReachHere();
3419    }
3420  %}
3421  ins_pipe( pipe_slow );
3422%}
3423
3424// ----------------------------------------------------------------------------------------------------
3425
3426#ifdef _LP64
3427instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3428  match(Set dst (RoundDoubleMode src rmode));
3429  format %{ "roundsd $dst,$src" %}
3430  ins_cost(150);
3431  ins_encode %{
3432    assert(UseSSE >= 4, "required");
3433    __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3434  %}
3435  ins_pipe(pipe_slow);
3436%}
3437
3438instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3439  match(Set dst (RoundDoubleMode (LoadD src) rmode));
3440  format %{ "roundsd $dst,$src" %}
3441  ins_cost(150);
3442  ins_encode %{
3443    assert(UseSSE >= 4, "required");
3444    __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3445  %}
3446  ins_pipe(pipe_slow);
3447%}
3448
3449instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3450  match(Set dst (RoundDoubleMode con rmode));
3451  effect(TEMP scratch_reg);
3452  format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3453  ins_cost(150);
3454  ins_encode %{
3455    assert(UseSSE >= 4, "required");
3456    __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3457  %}
3458  ins_pipe(pipe_slow);
3459%}
3460
3461instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3462  predicate(vector_length(n) < 8);
3463  match(Set dst (RoundDoubleModeV src rmode));
3464  format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3465  ins_encode %{
3466    assert(UseAVX > 0, "required");
3467    int vlen_enc = vector_length_encoding(this);
3468    __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3469  %}
3470  ins_pipe( pipe_slow );
3471%}
3472
3473instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3474  predicate(vector_length(n) == 8);
3475  match(Set dst (RoundDoubleModeV src rmode));
3476  format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3477  ins_encode %{
3478    assert(UseAVX > 2, "required");
3479    __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3480  %}
3481  ins_pipe( pipe_slow );
3482%}
3483
3484instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3485  predicate(vector_length(n) < 8);
3486  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3487  format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3488  ins_encode %{
3489    assert(UseAVX > 0, "required");
3490    int vlen_enc = vector_length_encoding(this);
3491    __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3492  %}
3493  ins_pipe( pipe_slow );
3494%}
3495
3496instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3497  predicate(vector_length(n) == 8);
3498  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3499  format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3500  ins_encode %{
3501    assert(UseAVX > 2, "required");
3502    __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3503  %}
3504  ins_pipe( pipe_slow );
3505%}
3506#endif // _LP64
3507
3508instruct onspinwait() %{
3509  match(OnSpinWait);
3510  ins_cost(200);
3511
3512  format %{
3513    $$template
3514    $$emit$$"pause\t! membar_onspinwait"
3515  %}
3516  ins_encode %{
3517    __ pause();
3518  %}
3519  ins_pipe(pipe_slow);
3520%}
3521
3522// a * b + c
3523instruct fmaD_reg(regD a, regD b, regD c) %{
3524  predicate(UseFMA);
3525  match(Set c (FmaD  c (Binary a b)));
3526  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3527  ins_cost(150);
3528  ins_encode %{
3529    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3530  %}
3531  ins_pipe( pipe_slow );
3532%}
3533
3534// a * b + c
3535instruct fmaF_reg(regF a, regF b, regF c) %{
3536  predicate(UseFMA);
3537  match(Set c (FmaF  c (Binary a b)));
3538  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3539  ins_cost(150);
3540  ins_encode %{
3541    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3542  %}
3543  ins_pipe( pipe_slow );
3544%}
3545
3546// ====================VECTOR INSTRUCTIONS=====================================
3547
3548// Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3549instruct MoveVec2Leg(legVec dst, vec src) %{
3550  match(Set dst src);
3551  format %{ "" %}
3552  ins_encode %{
3553    ShouldNotReachHere();
3554  %}
3555  ins_pipe( fpu_reg_reg );
3556%}
3557
3558instruct MoveLeg2Vec(vec dst, legVec src) %{
3559  match(Set dst src);
3560  format %{ "" %}
3561  ins_encode %{
3562    ShouldNotReachHere();
3563  %}
3564  ins_pipe( fpu_reg_reg );
3565%}
3566
3567// ============================================================================
3568
3569// Load vectors generic operand pattern
3570instruct loadV(vec dst, memory mem) %{
3571  match(Set dst (LoadVector mem));
3572  ins_cost(125);
3573  format %{ "load_vector $dst,$mem" %}
3574  ins_encode %{
3575    switch (vector_length_in_bytes(this)) {
3576      case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3577      case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3578      case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3579      case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3580      case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3581      default: ShouldNotReachHere();
3582    }
3583  %}
3584  ins_pipe( pipe_slow );
3585%}
3586
3587// Store vectors generic operand pattern.
3588instruct storeV(memory mem, vec src) %{
3589  match(Set mem (StoreVector mem src));
3590  ins_cost(145);
3591  format %{ "store_vector $mem,$src\n\t" %}
3592  ins_encode %{
3593    switch (vector_length_in_bytes(this, $src)) {
3594      case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3595      case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3596      case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3597      case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3598      case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3599      default: ShouldNotReachHere();
3600    }
3601  %}
3602  ins_pipe( pipe_slow );
3603%}
3604
3605// ---------------------------------------- Gather ------------------------------------
3606
3607// Gather INT, LONG, FLOAT, DOUBLE
3608
3609instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3610  predicate(vector_length_in_bytes(n) <= 32);
3611  match(Set dst (LoadVectorGather mem idx));
3612  effect(TEMP dst, TEMP tmp, TEMP mask);
3613  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3614  ins_encode %{
3615    assert(UseAVX >= 2, "sanity");
3616
3617    int vlen_enc = vector_length_encoding(this);
3618    BasicType elem_bt = vector_element_basic_type(this);
3619
3620    assert(vector_length_in_bytes(this) >= 16, "sanity");
3621    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3622
3623    if (vlen_enc == Assembler::AVX_128bit) {
3624      __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3625    } else {
3626      __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3627    }
3628    __ lea($tmp$$Register, $mem$$Address);
3629    __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3630  %}
3631  ins_pipe( pipe_slow );
3632%}
3633
3634instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3635  predicate(vector_length_in_bytes(n) == 64);
3636  match(Set dst (LoadVectorGather mem idx));
3637  effect(TEMP dst, TEMP tmp, TEMP ktmp);
3638  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and k2 as TEMP" %}
3639  ins_encode %{
3640    assert(UseAVX > 2, "sanity");
3641
3642    int vlen_enc = vector_length_encoding(this);
3643    BasicType elem_bt = vector_element_basic_type(this);
3644
3645    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3646
3647    __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3648    __ lea($tmp$$Register, $mem$$Address);
3649    __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3650  %}
3651  ins_pipe( pipe_slow );
3652%}
3653
3654// ====================Scatter=======================================
3655
3656// Scatter INT, LONG, FLOAT, DOUBLE
3657
3658instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3659  predicate(UseAVX > 2);
3660  match(Set mem (StoreVectorScatter mem (Binary src idx)));
3661  effect(TEMP tmp, TEMP ktmp);
3662  format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3663  ins_encode %{
3664    int vlen_enc = vector_length_encoding(this, $src);
3665    BasicType elem_bt = vector_element_basic_type(this, $src);
3666
3667    assert(vector_length_in_bytes(this, $src) >= 16, "sanity");
3668    assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3669
3670    __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3671    __ lea($tmp$$Register, $mem$$Address);
3672    __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3673  %}
3674  ins_pipe( pipe_slow );
3675%}
3676
3677// ====================REPLICATE=======================================
3678
3679// Replicate byte scalar to be vector
3680instruct ReplB_reg(vec dst, rRegI src) %{
3681  match(Set dst (ReplicateB src));
3682  format %{ "replicateB $dst,$src" %}
3683  ins_encode %{
3684    uint vlen = vector_length(this);
3685    if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3686      assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3687      int vlen_enc = vector_length_encoding(this);
3688      __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3689    } else if (VM_Version::supports_avx2()) {
3690      int vlen_enc = vector_length_encoding(this);
3691      __ movdl($dst$$XMMRegister, $src$$Register);
3692      __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3693    } else {
3694      __ movdl($dst$$XMMRegister, $src$$Register);
3695      __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3696      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3697      if (vlen >= 16) {
3698        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3699        if (vlen >= 32) {
3700          assert(vlen == 32, "sanity");
3701          __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3702        }
3703      }
3704    }
3705  %}
3706  ins_pipe( pipe_slow );
3707%}
3708
3709instruct ReplB_mem(vec dst, memory mem) %{
3710  predicate(VM_Version::supports_avx2());
3711  match(Set dst (ReplicateB (LoadB mem)));
3712  format %{ "replicateB $dst,$mem" %}
3713  ins_encode %{
3714    int vlen_enc = vector_length_encoding(this);
3715    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3716  %}
3717  ins_pipe( pipe_slow );
3718%}
3719
3720instruct ReplB_imm(vec dst, immI con) %{
3721  match(Set dst (ReplicateB con));
3722  format %{ "replicateB $dst,$con" %}
3723  ins_encode %{
3724    uint vlen = vector_length(this);
3725    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3726    if (vlen == 4) {
3727      __ movdl($dst$$XMMRegister, const_addr);
3728    } else {
3729      __ movq($dst$$XMMRegister, const_addr);
3730      if (vlen >= 16) {
3731        if (VM_Version::supports_avx2()) {
3732          int vlen_enc = vector_length_encoding(this);
3733          __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3734        } else {
3735          assert(vlen == 16, "sanity");
3736          __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3737        }
3738      }
3739    }
3740  %}
3741  ins_pipe( pipe_slow );
3742%}
3743
3744// Replicate byte scalar zero to be vector
3745instruct ReplB_zero(vec dst, immI_0 zero) %{
3746  match(Set dst (ReplicateB zero));
3747  format %{ "replicateB $dst,$zero" %}
3748  ins_encode %{
3749    uint vlen = vector_length(this);
3750    if (vlen <= 16) {
3751      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3752    } else {
3753      // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3754      int vlen_enc = vector_length_encoding(this);
3755      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3756    }
3757  %}
3758  ins_pipe( fpu_reg_reg );
3759%}
3760
3761// ====================ReplicateS=======================================
3762
3763instruct ReplS_reg(vec dst, rRegI src) %{
3764  match(Set dst (ReplicateS src));
3765  format %{ "replicateS $dst,$src" %}
3766  ins_encode %{
3767    uint vlen = vector_length(this);
3768    if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3769      assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
3770      int vlen_enc = vector_length_encoding(this);
3771      __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
3772    } else if (VM_Version::supports_avx2()) {
3773      int vlen_enc = vector_length_encoding(this);
3774      __ movdl($dst$$XMMRegister, $src$$Register);
3775      __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3776    } else {
3777      __ movdl($dst$$XMMRegister, $src$$Register);
3778      __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3779      if (vlen >= 8) {
3780        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3781        if (vlen >= 16) {
3782          assert(vlen == 16, "sanity");
3783          __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3784        }
3785      }
3786    }
3787  %}
3788  ins_pipe( pipe_slow );
3789%}
3790
3791instruct ReplS_mem(vec dst, memory mem) %{
3792  predicate(VM_Version::supports_avx2());
3793  match(Set dst (ReplicateS (LoadS mem)));
3794  format %{ "replicateS $dst,$mem" %}
3795  ins_encode %{
3796    int vlen_enc = vector_length_encoding(this);
3797    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
3798  %}
3799  ins_pipe( pipe_slow );
3800%}
3801
3802instruct ReplS_imm(vec dst, immI con) %{
3803  match(Set dst (ReplicateS con));
3804  format %{ "replicateS $dst,$con" %}
3805  ins_encode %{
3806    uint vlen = vector_length(this);
3807    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3808    if (vlen == 2) {
3809      __ movdl($dst$$XMMRegister, const_addr);
3810    } else {
3811      __ movq($dst$$XMMRegister, const_addr);
3812      if (vlen >= 8) {
3813        if (VM_Version::supports_avx2()) {
3814          int vlen_enc = vector_length_encoding(this);
3815          __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3816        } else {
3817          assert(vlen == 8, "sanity");
3818          __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3819        }
3820      }
3821    }
3822  %}
3823  ins_pipe( fpu_reg_reg );
3824%}
3825
3826instruct ReplS_zero(vec dst, immI_0 zero) %{
3827  match(Set dst (ReplicateS zero));
3828  format %{ "replicateS $dst,$zero" %}
3829  ins_encode %{
3830    uint vlen = vector_length(this);
3831    if (vlen <= 8) {
3832      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3833    } else {
3834      int vlen_enc = vector_length_encoding(this);
3835      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3836    }
3837  %}
3838  ins_pipe( fpu_reg_reg );
3839%}
3840
3841// ====================ReplicateI=======================================
3842
3843instruct ReplI_reg(vec dst, rRegI src) %{
3844  match(Set dst (ReplicateI src));
3845  format %{ "replicateI $dst,$src" %}
3846  ins_encode %{
3847    uint vlen = vector_length(this);
3848    if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3849      int vlen_enc = vector_length_encoding(this);
3850      __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
3851    } else if (VM_Version::supports_avx2()) {
3852      int vlen_enc = vector_length_encoding(this);
3853      __ movdl($dst$$XMMRegister, $src$$Register);
3854      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3855    } else {
3856      __ movdl($dst$$XMMRegister, $src$$Register);
3857      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3858      if (vlen >= 8) {
3859        assert(vlen == 8, "sanity");
3860        __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3861      }
3862    }
3863  %}
3864  ins_pipe( pipe_slow );
3865%}
3866
3867instruct ReplI_mem(vec dst, memory mem) %{
3868  match(Set dst (ReplicateI (LoadI mem)));
3869  format %{ "replicateI $dst,$mem" %}
3870  ins_encode %{
3871    uint vlen = vector_length(this);
3872    if (vlen <= 4) {
3873      __ movdl($dst$$XMMRegister, $mem$$Address);
3874      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3875    } else {
3876      assert(VM_Version::supports_avx2(), "sanity");
3877      int vlen_enc = vector_length_encoding(this);
3878      __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3879    }
3880  %}
3881  ins_pipe( pipe_slow );
3882%}
3883
3884instruct ReplI_imm(vec dst, immI con) %{
3885  match(Set dst (ReplicateI con));
3886  format %{ "replicateI $dst,$con" %}
3887  ins_encode %{
3888    uint vlen = vector_length(this);
3889    InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3890    if (vlen <= 4) {
3891      __ movq($dst$$XMMRegister, const_addr);
3892      if (vlen == 4) {
3893        __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3894      }
3895    } else {
3896      assert(VM_Version::supports_avx2(), "sanity");
3897      int vlen_enc = vector_length_encoding(this);
3898      __ movq($dst$$XMMRegister, const_addr);
3899      __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3900    }
3901  %}
3902  ins_pipe( pipe_slow );
3903%}
3904
3905// Replicate integer (4 byte) scalar zero to be vector
3906instruct ReplI_zero(vec dst, immI_0 zero) %{
3907  match(Set dst (ReplicateI zero));
3908  format %{ "replicateI $dst,$zero" %}
3909  ins_encode %{
3910    uint vlen = vector_length(this);
3911    if (vlen <= 4) {
3912      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3913    } else {
3914      int vlen_enc = vector_length_encoding(this);
3915      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3916    }
3917  %}
3918  ins_pipe( fpu_reg_reg );
3919%}
3920
3921instruct ReplI_M1(vec dst, immI_M1 con) %{
3922  predicate(UseAVX > 0);
3923  match(Set dst (ReplicateB con));
3924  match(Set dst (ReplicateS con));
3925  match(Set dst (ReplicateI con));
3926  effect(TEMP dst);
3927  format %{ "vallones $dst" %}
3928  ins_encode %{
3929    int vector_len = vector_length_encoding(this);
3930    __ vallones($dst$$XMMRegister, vector_len);
3931  %}
3932  ins_pipe( pipe_slow );
3933%}
3934
3935// ====================ReplicateL=======================================
3936
3937#ifdef _LP64
3938// Replicate long (8 byte) scalar to be vector
3939instruct ReplL_reg(vec dst, rRegL src) %{
3940  match(Set dst (ReplicateL src));
3941  format %{ "replicateL $dst,$src" %}
3942  ins_encode %{
3943    uint vlen = vector_length(this);
3944    if (vlen == 2) {
3945      __ movdq($dst$$XMMRegister, $src$$Register);
3946      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3947    } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3948      int vlen_enc = vector_length_encoding(this);
3949      __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3950    } else if (VM_Version::supports_avx2()) {
3951      assert(vlen == 4, "sanity");
3952      int vlen_enc = vector_length_encoding(this);
3953      __ movdq($dst$$XMMRegister, $src$$Register);
3954      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3955    } else {
3956      assert(vlen == 4, "sanity");
3957      __ movdq($dst$$XMMRegister, $src$$Register);
3958      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3959      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3960    }
3961  %}
3962  ins_pipe( pipe_slow );
3963%}
3964#else // _LP64
3965// Replicate long (8 byte) scalar to be vector
3966instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3967  predicate(vector_length(n) <= 4);
3968  match(Set dst (ReplicateL src));
3969  effect(TEMP dst, USE src, TEMP tmp);
3970  format %{ "replicateL $dst,$src" %}
3971  ins_encode %{
3972    uint vlen = vector_length(this);
3973    if (vlen == 2) {
3974      __ movdl($dst$$XMMRegister, $src$$Register);
3975      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3976      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3977      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3978    } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3979      int vlen_enc = Assembler::AVX_256bit;
3980      __ movdl($dst$$XMMRegister, $src$$Register);
3981      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3982      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3983      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3984    } else {
3985      __ movdl($dst$$XMMRegister, $src$$Register);
3986      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3987      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3988      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3989      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3990    }
3991  %}
3992  ins_pipe( pipe_slow );
3993%}
3994
3995instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3996  predicate(vector_length(n) == 8);
3997  match(Set dst (ReplicateL src));
3998  effect(TEMP dst, USE src, TEMP tmp);
3999  format %{ "replicateL $dst,$src" %}
4000  ins_encode %{
4001    if (VM_Version::supports_avx512vl()) {
4002      __ movdl($dst$$XMMRegister, $src$$Register);
4003      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4004      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4005      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4006      __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4007      __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4008    } else {
4009      int vlen_enc = Assembler::AVX_512bit;
4010      __ movdl($dst$$XMMRegister, $src$$Register);
4011      __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4012      __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4013      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4014    }
4015  %}
4016  ins_pipe( pipe_slow );
4017%}
4018#endif // _LP64
4019
4020instruct ReplL_mem(vec dst, memory mem) %{
4021  match(Set dst (ReplicateL (LoadL mem)));
4022  format %{ "replicateL $dst,$mem" %}
4023  ins_encode %{
4024    uint vlen = vector_length(this);
4025    if (vlen == 2) {
4026      __ movq($dst$$XMMRegister, $mem$$Address);
4027      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4028    } else {
4029      assert(VM_Version::supports_avx2(), "sanity");
4030      int vlen_enc = vector_length_encoding(this);
4031      __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4032    }
4033  %}
4034  ins_pipe( pipe_slow );
4035%}
4036
4037// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4038instruct ReplL_imm(vec dst, immL con) %{
4039  match(Set dst (ReplicateL con));
4040  format %{ "replicateL $dst,$con" %}
4041  ins_encode %{
4042    uint vlen = vector_length(this);
4043    InternalAddress const_addr = $constantaddress($con);
4044    if (vlen == 2) {
4045      __ movq($dst$$XMMRegister, const_addr);
4046      __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4047    } else {
4048      assert(VM_Version::supports_avx2(), "sanity");
4049      int vlen_enc = vector_length_encoding(this);
4050      __ movq($dst$$XMMRegister, const_addr);
4051      __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4052    }
4053  %}
4054  ins_pipe( pipe_slow );
4055%}
4056
4057instruct ReplL_zero(vec dst, immL0 zero) %{
4058  match(Set dst (ReplicateL zero));
4059  format %{ "replicateL $dst,$zero" %}
4060  ins_encode %{
4061    int vlen = vector_length(this);
4062    if (vlen == 2) {
4063      __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4064    } else {
4065      int vlen_enc = vector_length_encoding(this);
4066      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4067    }
4068  %}
4069  ins_pipe( fpu_reg_reg );
4070%}
4071
4072instruct ReplL_M1(vec dst, immL_M1 con) %{
4073  predicate(UseAVX > 0);
4074  match(Set dst (ReplicateL con));
4075  effect(TEMP dst);
4076  format %{ "vallones $dst" %}
4077  ins_encode %{
4078    int vector_len = vector_length_encoding(this);
4079    __ vallones($dst$$XMMRegister, vector_len);
4080  %}
4081  ins_pipe( pipe_slow );
4082%}
4083
4084// ====================ReplicateF=======================================
4085
4086instruct ReplF_reg(vec dst, vlRegF src) %{
4087  match(Set dst (ReplicateF src));
4088  format %{ "replicateF $dst,$src" %}
4089  ins_encode %{
4090    uint vlen = vector_length(this);
4091    if (vlen <= 4) {
4092      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4093   } else if (VM_Version::supports_avx2()) {
4094      int vlen_enc = vector_length_encoding(this);
4095      __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4096    } else {
4097      assert(vlen == 8, "sanity");
4098      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4099      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4100    }
4101  %}
4102  ins_pipe( pipe_slow );
4103%}
4104
4105instruct ReplF_mem(vec dst, memory mem) %{
4106  match(Set dst (ReplicateF (LoadF mem)));
4107  format %{ "replicateF $dst,$mem" %}
4108  ins_encode %{
4109    uint vlen = vector_length(this);
4110    if (vlen <= 4) {
4111      __ movdl($dst$$XMMRegister, $mem$$Address);
4112      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4113    } else {
4114      assert(VM_Version::supports_avx(), "sanity");
4115      int vlen_enc = vector_length_encoding(this);
4116      __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4117    }
4118  %}
4119  ins_pipe( pipe_slow );
4120%}
4121
4122instruct ReplF_zero(vec dst, immF0 zero) %{
4123  match(Set dst (ReplicateF zero));
4124  format %{ "replicateF $dst,$zero" %}
4125  ins_encode %{
4126    uint vlen = vector_length(this);
4127    if (vlen <= 4) {
4128      __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4129    } else {
4130      int vlen_enc = vector_length_encoding(this);
4131      __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4132    }
4133  %}
4134  ins_pipe( fpu_reg_reg );
4135%}
4136
4137// ====================ReplicateD=======================================
4138
4139// Replicate double (8 bytes) scalar to be vector
4140instruct ReplD_reg(vec dst, vlRegD src) %{
4141  match(Set dst (ReplicateD src));
4142  format %{ "replicateD $dst,$src" %}
4143  ins_encode %{
4144    uint vlen = vector_length(this);
4145    if (vlen == 2) {
4146      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4147    } else if (VM_Version::supports_avx2()) {
4148      int vlen_enc = vector_length_encoding(this);
4149      __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4150    } else {
4151      assert(vlen == 4, "sanity");
4152      __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4153      __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4154    }
4155  %}
4156  ins_pipe( pipe_slow );
4157%}
4158
4159instruct ReplD_mem(vec dst, memory mem) %{
4160  match(Set dst (ReplicateD (LoadD mem)));
4161  format %{ "replicateD $dst,$mem" %}
4162  ins_encode %{
4163    uint vlen = vector_length(this);
4164    if (vlen == 2) {
4165      __ movq($dst$$XMMRegister, $mem$$Address);
4166      __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4167    } else {
4168      assert(VM_Version::supports_avx(), "sanity");
4169      int vlen_enc = vector_length_encoding(this);
4170      __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4171    }
4172  %}
4173  ins_pipe( pipe_slow );
4174%}
4175
4176instruct ReplD_zero(vec dst, immD0 zero) %{
4177  match(Set dst (ReplicateD zero));
4178  format %{ "replicateD $dst,$zero" %}
4179  ins_encode %{
4180    uint vlen = vector_length(this);
4181    if (vlen == 2) {
4182      __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4183    } else {
4184      int vlen_enc = vector_length_encoding(this);
4185      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4186    }
4187  %}
4188  ins_pipe( fpu_reg_reg );
4189%}
4190
4191// ====================VECTOR INSERT=======================================
4192
4193instruct insert(vec dst, rRegI val, immU8 idx) %{
4194  predicate(vector_length_in_bytes(n) < 32);
4195  match(Set dst (VectorInsert (Binary dst val) idx));
4196  format %{ "vector_insert $dst,$val,$idx" %}
4197  ins_encode %{
4198    assert(UseSSE >= 4, "required");
4199    assert(vector_length_in_bytes(this) >= 8, "required");
4200
4201    BasicType elem_bt = vector_element_basic_type(this);
4202
4203    assert(is_integral_type(elem_bt), "");
4204    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4205
4206    __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4207  %}
4208  ins_pipe( pipe_slow );
4209%}
4210
4211instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4212  predicate(vector_length_in_bytes(n) == 32);
4213  match(Set dst (VectorInsert (Binary src val) idx));
4214  effect(TEMP vtmp);
4215  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4216  ins_encode %{
4217    int vlen_enc = Assembler::AVX_256bit;
4218    BasicType elem_bt = vector_element_basic_type(this);
4219    int elem_per_lane = 16/type2aelembytes(elem_bt);
4220    int log2epr = log2(elem_per_lane);
4221
4222    assert(is_integral_type(elem_bt), "sanity");
4223    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4224
4225    uint x_idx = $idx$$constant & right_n_bits(log2epr);
4226    uint y_idx = ($idx$$constant >> log2epr) & 1;
4227    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4228    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4229    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4230  %}
4231  ins_pipe( pipe_slow );
4232%}
4233
4234instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4235  predicate(vector_length_in_bytes(n) == 64);
4236  match(Set dst (VectorInsert (Binary src val) idx));
4237  effect(TEMP vtmp);
4238  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4239  ins_encode %{
4240    assert(UseAVX > 2, "sanity");
4241
4242    BasicType elem_bt = vector_element_basic_type(this);
4243    int elem_per_lane = 16/type2aelembytes(elem_bt);
4244    int log2epr = log2(elem_per_lane);
4245
4246    assert(is_integral_type(elem_bt), "");
4247    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4248
4249    uint x_idx = $idx$$constant & right_n_bits(log2epr);
4250    uint y_idx = ($idx$$constant >> log2epr) & 3;
4251    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4252    __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4253    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4254  %}
4255  ins_pipe( pipe_slow );
4256%}
4257
4258#ifdef _LP64
4259instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4260  predicate(vector_length(n) == 2);
4261  match(Set dst (VectorInsert (Binary dst val) idx));
4262  format %{ "vector_insert $dst,$val,$idx" %}
4263  ins_encode %{
4264    assert(UseSSE >= 4, "required");
4265    assert(vector_element_basic_type(this) == T_LONG, "");
4266    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4267
4268    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4269  %}
4270  ins_pipe( pipe_slow );
4271%}
4272
4273instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4274  predicate(vector_length(n) == 4);
4275  match(Set dst (VectorInsert (Binary src val) idx));
4276  effect(TEMP vtmp);
4277  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4278  ins_encode %{
4279    assert(vector_element_basic_type(this) == T_LONG, "");
4280    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4281
4282    uint x_idx = $idx$$constant & right_n_bits(1);
4283    uint y_idx = ($idx$$constant >> 1) & 1;
4284    int vlen_enc = Assembler::AVX_256bit;
4285    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4286    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4287    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4288  %}
4289  ins_pipe( pipe_slow );
4290%}
4291
4292instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4293  predicate(vector_length(n) == 8);
4294  match(Set dst (VectorInsert (Binary src val) idx));
4295  effect(TEMP vtmp);
4296  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4297  ins_encode %{
4298    assert(vector_element_basic_type(this) == T_LONG, "sanity");
4299    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4300
4301    uint x_idx = $idx$$constant & right_n_bits(1);
4302    uint y_idx = ($idx$$constant >> 1) & 3;
4303    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4304    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4305    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4306  %}
4307  ins_pipe( pipe_slow );
4308%}
4309#endif
4310
4311instruct insertF(vec dst, regF val, immU8 idx) %{
4312  predicate(vector_length(n) < 8);
4313  match(Set dst (VectorInsert (Binary dst val) idx));
4314  format %{ "vector_insert $dst,$val,$idx" %}
4315  ins_encode %{
4316    assert(UseSSE >= 4, "sanity");
4317
4318    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4319    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4320
4321    __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4322  %}
4323  ins_pipe( pipe_slow );
4324%}
4325
4326instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4327  predicate(vector_length(n) >= 8);
4328  match(Set dst (VectorInsert (Binary src val) idx));
4329  effect(TEMP vtmp);
4330  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4331  ins_encode %{
4332    assert(vector_element_basic_type(this) == T_FLOAT, "sanity");
4333    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4334
4335    int vlen = vector_length(this);
4336    uint x_idx = $idx$$constant & right_n_bits(2);
4337    if (vlen == 8) {
4338      uint y_idx = ($idx$$constant >> 2) & 1;
4339      int vlen_enc = Assembler::AVX_256bit;
4340      __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4341      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4342      __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4343    } else {
4344      assert(vlen == 16, "sanity");
4345      uint y_idx = ($idx$$constant >> 2) & 3;
4346      __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4347      __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4348      __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4349    }
4350  %}
4351  ins_pipe( pipe_slow );
4352%}
4353
4354#ifdef _LP64
4355instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4356  predicate(vector_length(n) == 2);
4357  match(Set dst (VectorInsert (Binary dst val) idx));
4358  effect(TEMP tmp);
4359  format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4360  ins_encode %{
4361    assert(UseSSE >= 4, "sanity");
4362    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4363    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4364
4365    __ movq($tmp$$Register, $val$$XMMRegister);
4366    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4367  %}
4368  ins_pipe( pipe_slow );
4369%}
4370
4371instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4372  predicate(vector_length(n) == 4);
4373  match(Set dst (VectorInsert (Binary src val) idx));
4374  effect(TEMP vtmp, TEMP tmp);
4375  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4376  ins_encode %{
4377    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4378    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4379
4380    uint x_idx = $idx$$constant & right_n_bits(1);
4381    uint y_idx = ($idx$$constant >> 1) & 1;
4382    int vlen_enc = Assembler::AVX_256bit;
4383    __ movq($tmp$$Register, $val$$XMMRegister);
4384    __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4385    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4386    __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4387  %}
4388  ins_pipe( pipe_slow );
4389%}
4390
4391instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4392  predicate(vector_length(n) == 8);
4393  match(Set dst (VectorInsert (Binary src val) idx));
4394  effect(TEMP tmp, TEMP vtmp);
4395  format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4396  ins_encode %{
4397    assert(vector_element_basic_type(this) == T_DOUBLE, "sanity");
4398    assert($idx$$constant < (int)vector_length(this), "out of bounds");
4399
4400    uint x_idx = $idx$$constant & right_n_bits(1);
4401    uint y_idx = ($idx$$constant >> 1) & 3;
4402    __ movq($tmp$$Register, $val$$XMMRegister);
4403    __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4404    __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4405    __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4406  %}
4407  ins_pipe( pipe_slow );
4408%}
4409#endif
4410
4411// ====================REDUCTION ARITHMETIC=======================================
4412
4413// =======================Int Reduction==========================================
4414
4415instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4416  predicate(vector_element_basic_type(n->in(2)) == T_INT); // src2
4417  match(Set dst (AddReductionVI src1 src2));
4418  match(Set dst (MulReductionVI src1 src2));
4419  match(Set dst (AndReductionV  src1 src2));
4420  match(Set dst ( OrReductionV  src1 src2));
4421  match(Set dst (XorReductionV  src1 src2));
4422  match(Set dst (MinReductionV  src1 src2));
4423  match(Set dst (MaxReductionV  src1 src2));
4424  effect(TEMP vtmp1, TEMP vtmp2);
4425  format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4426  ins_encode %{
4427    int opcode = this->ideal_Opcode();
4428    int vlen = vector_length(this, $src2);
4429    __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4430  %}
4431  ins_pipe( pipe_slow );
4432%}
4433
4434// =======================Long Reduction==========================================
4435
4436#ifdef _LP64
4437instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4438  predicate(vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4439  match(Set dst (AddReductionVL src1 src2));
4440  match(Set dst (MulReductionVL src1 src2));
4441  match(Set dst (AndReductionV  src1 src2));
4442  match(Set dst ( OrReductionV  src1 src2));
4443  match(Set dst (XorReductionV  src1 src2));
4444  match(Set dst (MinReductionV  src1 src2));
4445  match(Set dst (MaxReductionV  src1 src2));
4446  effect(TEMP vtmp1, TEMP vtmp2);
4447  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4448  ins_encode %{
4449    int opcode = this->ideal_Opcode();
4450    int vlen = vector_length(this, $src2);
4451    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4452  %}
4453  ins_pipe( pipe_slow );
4454%}
4455
4456instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4457  predicate(vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4458  match(Set dst (AddReductionVL src1 src2));
4459  match(Set dst (MulReductionVL src1 src2));
4460  match(Set dst (AndReductionV  src1 src2));
4461  match(Set dst ( OrReductionV  src1 src2));
4462  match(Set dst (XorReductionV  src1 src2));
4463  match(Set dst (MinReductionV  src1 src2));
4464  match(Set dst (MaxReductionV  src1 src2));
4465  effect(TEMP vtmp1, TEMP vtmp2);
4466  format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4467  ins_encode %{
4468    int opcode = this->ideal_Opcode();
4469    int vlen = vector_length(this, $src2);
4470    __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4471  %}
4472  ins_pipe( pipe_slow );
4473%}
4474#endif // _LP64
4475
4476// =======================Float Reduction==========================================
4477
4478instruct reductionF128(regF dst, vec src, vec vtmp) %{
4479  predicate(vector_length(n->in(2)) <= 4); // src
4480  match(Set dst (AddReductionVF dst src));
4481  match(Set dst (MulReductionVF dst src));
4482  effect(TEMP dst, TEMP vtmp);
4483  format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4484  ins_encode %{
4485    int opcode = this->ideal_Opcode();
4486    int vlen = vector_length(this, $src);
4487    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4488  %}
4489  ins_pipe( pipe_slow );
4490%}
4491
4492instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4493  predicate(vector_length(n->in(2)) == 8); // src
4494  match(Set dst (AddReductionVF dst src));
4495  match(Set dst (MulReductionVF dst src));
4496  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4497  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4498  ins_encode %{
4499    int opcode = this->ideal_Opcode();
4500    int vlen = vector_length(this, $src);
4501    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4502  %}
4503  ins_pipe( pipe_slow );
4504%}
4505
4506instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4507  predicate(vector_length(n->in(2)) == 16); // src
4508  match(Set dst (AddReductionVF dst src));
4509  match(Set dst (MulReductionVF dst src));
4510  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4511  format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4512  ins_encode %{
4513    int opcode = this->ideal_Opcode();
4514    int vlen = vector_length(this, $src);
4515    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4516  %}
4517  ins_pipe( pipe_slow );
4518%}
4519
4520// =======================Double Reduction==========================================
4521
4522instruct reduction2D(regD dst, vec src, vec vtmp) %{
4523  predicate(vector_length(n->in(2)) == 2); // src
4524  match(Set dst (AddReductionVD dst src));
4525  match(Set dst (MulReductionVD dst src));
4526  effect(TEMP dst, TEMP vtmp);
4527  format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4528  ins_encode %{
4529    int opcode = this->ideal_Opcode();
4530    int vlen = vector_length(this, $src);
4531    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4532%}
4533  ins_pipe( pipe_slow );
4534%}
4535
4536instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4537  predicate(vector_length(n->in(2)) == 4); // src
4538  match(Set dst (AddReductionVD dst src));
4539  match(Set dst (MulReductionVD dst src));
4540  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4541  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4542  ins_encode %{
4543    int opcode = this->ideal_Opcode();
4544    int vlen = vector_length(this, $src);
4545    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4546  %}
4547  ins_pipe( pipe_slow );
4548%}
4549
4550instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4551  predicate(vector_length(n->in(2)) == 8); // src
4552  match(Set dst (AddReductionVD dst src));
4553  match(Set dst (MulReductionVD dst src));
4554  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4555  format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4556  ins_encode %{
4557    int opcode = this->ideal_Opcode();
4558    int vlen = vector_length(this, $src);
4559    __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4560  %}
4561  ins_pipe( pipe_slow );
4562%}
4563
4564// =======================Byte Reduction==========================================
4565
4566#ifdef _LP64
4567instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4568  predicate(vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4569  match(Set dst (AddReductionVI src1 src2));
4570  match(Set dst (AndReductionV  src1 src2));
4571  match(Set dst ( OrReductionV  src1 src2));
4572  match(Set dst (XorReductionV  src1 src2));
4573  match(Set dst (MinReductionV  src1 src2));
4574  match(Set dst (MaxReductionV  src1 src2));
4575  effect(TEMP vtmp1, TEMP vtmp2);
4576  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4577  ins_encode %{
4578    int opcode = this->ideal_Opcode();
4579    int vlen = vector_length(this, $src2);
4580    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4581  %}
4582  ins_pipe( pipe_slow );
4583%}
4584
4585instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4586  predicate(vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4587  match(Set dst (AddReductionVI src1 src2));
4588  match(Set dst (AndReductionV  src1 src2));
4589  match(Set dst ( OrReductionV  src1 src2));
4590  match(Set dst (XorReductionV  src1 src2));
4591  match(Set dst (MinReductionV  src1 src2));
4592  match(Set dst (MaxReductionV  src1 src2));
4593  effect(TEMP vtmp1, TEMP vtmp2);
4594  format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4595  ins_encode %{
4596    int opcode = this->ideal_Opcode();
4597    int vlen = vector_length(this, $src2);
4598    __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4599  %}
4600  ins_pipe( pipe_slow );
4601%}
4602#endif
4603
4604// =======================Short Reduction==========================================
4605
4606instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4607  predicate(vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4608  match(Set dst (AddReductionVI src1 src2));
4609  match(Set dst (MulReductionVI src1 src2));
4610  match(Set dst (AndReductionV  src1 src2));
4611  match(Set dst ( OrReductionV  src1 src2));
4612  match(Set dst (XorReductionV  src1 src2));
4613  match(Set dst (MinReductionV  src1 src2));
4614  match(Set dst (MaxReductionV  src1 src2));
4615  effect(TEMP vtmp1, TEMP vtmp2);
4616  format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4617  ins_encode %{
4618    int opcode = this->ideal_Opcode();
4619    int vlen = vector_length(this, $src2);
4620    __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4621  %}
4622  ins_pipe( pipe_slow );
4623%}
4624
4625// =======================Mul Reduction==========================================
4626
4627instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4628  predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4629            vector_length(n->in(2)) <= 32); // src2
4630  match(Set dst (MulReductionVI src1 src2));
4631  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4632  format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4633  ins_encode %{
4634    int opcode = this->ideal_Opcode();
4635    int vlen = vector_length(this, $src2);
4636    __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4637  %}
4638  ins_pipe( pipe_slow );
4639%}
4640
4641instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4642  predicate(vector_element_basic_type(n->in(2)) == T_BYTE &&
4643            vector_length(n->in(2)) == 64); // src2
4644  match(Set dst (MulReductionVI src1 src2));
4645  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4646  format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4647  ins_encode %{
4648    int opcode = this->ideal_Opcode();
4649    int vlen = vector_length(this, $src2);
4650    __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4651  %}
4652  ins_pipe( pipe_slow );
4653%}
4654
4655//--------------------Min/Max Float Reduction --------------------
4656// Float Min Reduction
4657instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4658                            legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4659  predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4660            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4661             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4662            vector_length(n->in(2)) == 2);
4663  match(Set dst (MinReductionV src1 src2));
4664  match(Set dst (MaxReductionV src1 src2));
4665  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4666  format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4667  ins_encode %{
4668    assert(UseAVX > 0, "sanity");
4669
4670    int opcode = this->ideal_Opcode();
4671    int vlen = vector_length(this, $src2);
4672    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4673                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4674  %}
4675  ins_pipe( pipe_slow );
4676%}
4677
4678instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4679                           legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4680  predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4681            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4682             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4683            vector_length(n->in(2)) >= 4);
4684  match(Set dst (MinReductionV src1 src2));
4685  match(Set dst (MaxReductionV src1 src2));
4686  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4687  format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4688  ins_encode %{
4689    assert(UseAVX > 0, "sanity");
4690
4691    int opcode = this->ideal_Opcode();
4692    int vlen = vector_length(this, $src2);
4693    __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4694                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4695  %}
4696  ins_pipe( pipe_slow );
4697%}
4698
4699instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4700                               legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4701  predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4702            vector_length(n->in(2)) == 2);
4703  match(Set dst (MinReductionV dst src));
4704  match(Set dst (MaxReductionV dst src));
4705  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4706  format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4707  ins_encode %{
4708    assert(UseAVX > 0, "sanity");
4709
4710    int opcode = this->ideal_Opcode();
4711    int vlen = vector_length(this, $src);
4712    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4713                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4714  %}
4715  ins_pipe( pipe_slow );
4716%}
4717
4718
4719instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4720                              legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4721  predicate(vector_element_basic_type(n->in(2)) == T_FLOAT &&
4722            vector_length(n->in(2)) >= 4);
4723  match(Set dst (MinReductionV dst src));
4724  match(Set dst (MaxReductionV dst src));
4725  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4726  format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4727  ins_encode %{
4728    assert(UseAVX > 0, "sanity");
4729
4730    int opcode = this->ideal_Opcode();
4731    int vlen = vector_length(this, $src);
4732    __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4733                         $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4734  %}
4735  ins_pipe( pipe_slow );
4736%}
4737
4738
4739//--------------------Min Double Reduction --------------------
4740instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4741                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4742                            rFlagsReg cr) %{
4743  predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4744            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4745             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4746            vector_length(n->in(2)) == 2);
4747  match(Set dst (MinReductionV src1 src2));
4748  match(Set dst (MaxReductionV src1 src2));
4749  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4750  format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4751  ins_encode %{
4752    assert(UseAVX > 0, "sanity");
4753
4754    int opcode = this->ideal_Opcode();
4755    int vlen = vector_length(this, $src2);
4756    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4757                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4758  %}
4759  ins_pipe( pipe_slow );
4760%}
4761
4762instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4763                           legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4764                           rFlagsReg cr) %{
4765  predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4766            ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4767             (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4768            vector_length(n->in(2)) >= 4);
4769  match(Set dst (MinReductionV src1 src2));
4770  match(Set dst (MaxReductionV src1 src2));
4771  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4772  format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4773  ins_encode %{
4774    assert(UseAVX > 0, "sanity");
4775
4776    int opcode = this->ideal_Opcode();
4777    int vlen = vector_length(this, $src2);
4778    __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4779                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4780  %}
4781  ins_pipe( pipe_slow );
4782%}
4783
4784
4785instruct minmax_reduction2D_av(legRegD dst, legVec src,
4786                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4787                               rFlagsReg cr) %{
4788  predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4789            vector_length(n->in(2)) == 2);
4790  match(Set dst (MinReductionV dst src));
4791  match(Set dst (MaxReductionV dst src));
4792  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4793  format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4794  ins_encode %{
4795    assert(UseAVX > 0, "sanity");
4796
4797    int opcode = this->ideal_Opcode();
4798    int vlen = vector_length(this, $src);
4799    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4800                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4801  %}
4802  ins_pipe( pipe_slow );
4803%}
4804
4805instruct minmax_reductionD_av(legRegD dst, legVec src,
4806                              legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4807                              rFlagsReg cr) %{
4808  predicate(vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4809            vector_length(n->in(2)) >= 4);
4810  match(Set dst (MinReductionV dst src));
4811  match(Set dst (MaxReductionV dst src));
4812  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4813  format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4814  ins_encode %{
4815    assert(UseAVX > 0, "sanity");
4816
4817    int opcode = this->ideal_Opcode();
4818    int vlen = vector_length(this, $src);
4819    __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4820                          $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4821  %}
4822  ins_pipe( pipe_slow );
4823%}
4824
4825// ====================VECTOR ARITHMETIC=======================================
4826
4827// --------------------------------- ADD --------------------------------------
4828
4829// Bytes vector add
4830instruct vaddB(vec dst, vec src) %{
4831  predicate(UseAVX == 0);
4832  match(Set dst (AddVB dst src));
4833  format %{ "paddb   $dst,$src\t! add packedB" %}
4834  ins_encode %{
4835    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4836  %}
4837  ins_pipe( pipe_slow );
4838%}
4839
4840instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4841  predicate(UseAVX > 0);
4842  match(Set dst (AddVB src1 src2));
4843  format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
4844  ins_encode %{
4845    int vlen_enc = vector_length_encoding(this);
4846    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4847  %}
4848  ins_pipe( pipe_slow );
4849%}
4850
4851instruct vaddB_mem(vec dst, vec src, memory mem) %{
4852  predicate((UseAVX > 0) &&
4853            (vector_length_in_bytes(n->in(1)) > 8));
4854  match(Set dst (AddVB src (LoadVector mem)));
4855  format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
4856  ins_encode %{
4857    int vlen_enc = vector_length_encoding(this);
4858    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4859  %}
4860  ins_pipe( pipe_slow );
4861%}
4862
4863// Shorts/Chars vector add
4864instruct vaddS(vec dst, vec src) %{
4865  predicate(UseAVX == 0);
4866  match(Set dst (AddVS dst src));
4867  format %{ "paddw   $dst,$src\t! add packedS" %}
4868  ins_encode %{
4869    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4870  %}
4871  ins_pipe( pipe_slow );
4872%}
4873
4874instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4875  predicate(UseAVX > 0);
4876  match(Set dst (AddVS src1 src2));
4877  format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
4878  ins_encode %{
4879    int vlen_enc = vector_length_encoding(this);
4880    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4881  %}
4882  ins_pipe( pipe_slow );
4883%}
4884
4885instruct vaddS_mem(vec dst, vec src, memory mem) %{
4886  predicate((UseAVX > 0) &&
4887            (vector_length_in_bytes(n->in(1)) > 8));
4888  match(Set dst (AddVS src (LoadVector mem)));
4889  format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
4890  ins_encode %{
4891    int vlen_enc = vector_length_encoding(this);
4892    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4893  %}
4894  ins_pipe( pipe_slow );
4895%}
4896
4897// Integers vector add
4898instruct vaddI(vec dst, vec src) %{
4899  predicate(UseAVX == 0);
4900  match(Set dst (AddVI dst src));
4901  format %{ "paddd   $dst,$src\t! add packedI" %}
4902  ins_encode %{
4903    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4904  %}
4905  ins_pipe( pipe_slow );
4906%}
4907
4908instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4909  predicate(UseAVX > 0);
4910  match(Set dst (AddVI src1 src2));
4911  format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
4912  ins_encode %{
4913    int vlen_enc = vector_length_encoding(this);
4914    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4915  %}
4916  ins_pipe( pipe_slow );
4917%}
4918
4919
4920instruct vaddI_mem(vec dst, vec src, memory mem) %{
4921  predicate((UseAVX > 0) &&
4922            (vector_length_in_bytes(n->in(1)) > 8));
4923  match(Set dst (AddVI src (LoadVector mem)));
4924  format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
4925  ins_encode %{
4926    int vlen_enc = vector_length_encoding(this);
4927    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4928  %}
4929  ins_pipe( pipe_slow );
4930%}
4931
4932// Longs vector add
4933instruct vaddL(vec dst, vec src) %{
4934  predicate(UseAVX == 0);
4935  match(Set dst (AddVL dst src));
4936  format %{ "paddq   $dst,$src\t! add packedL" %}
4937  ins_encode %{
4938    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4939  %}
4940  ins_pipe( pipe_slow );
4941%}
4942
4943instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4944  predicate(UseAVX > 0);
4945  match(Set dst (AddVL src1 src2));
4946  format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
4947  ins_encode %{
4948    int vlen_enc = vector_length_encoding(this);
4949    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4950  %}
4951  ins_pipe( pipe_slow );
4952%}
4953
4954instruct vaddL_mem(vec dst, vec src, memory mem) %{
4955  predicate((UseAVX > 0) &&
4956            (vector_length_in_bytes(n->in(1)) > 8));
4957  match(Set dst (AddVL src (LoadVector mem)));
4958  format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
4959  ins_encode %{
4960    int vlen_enc = vector_length_encoding(this);
4961    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4962  %}
4963  ins_pipe( pipe_slow );
4964%}
4965
4966// Floats vector add
4967instruct vaddF(vec dst, vec src) %{
4968  predicate(UseAVX == 0);
4969  match(Set dst (AddVF dst src));
4970  format %{ "addps   $dst,$src\t! add packedF" %}
4971  ins_encode %{
4972    __ addps($dst$$XMMRegister, $src$$XMMRegister);
4973  %}
4974  ins_pipe( pipe_slow );
4975%}
4976
4977instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4978  predicate(UseAVX > 0);
4979  match(Set dst (AddVF src1 src2));
4980  format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
4981  ins_encode %{
4982    int vlen_enc = vector_length_encoding(this);
4983    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4984  %}
4985  ins_pipe( pipe_slow );
4986%}
4987
4988instruct vaddF_mem(vec dst, vec src, memory mem) %{
4989  predicate((UseAVX > 0) &&
4990            (vector_length_in_bytes(n->in(1)) > 8));
4991  match(Set dst (AddVF src (LoadVector mem)));
4992  format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
4993  ins_encode %{
4994    int vlen_enc = vector_length_encoding(this);
4995    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4996  %}
4997  ins_pipe( pipe_slow );
4998%}
4999
5000// Doubles vector add
5001instruct vaddD(vec dst, vec src) %{
5002  predicate(UseAVX == 0);
5003  match(Set dst (AddVD dst src));
5004  format %{ "addpd   $dst,$src\t! add packedD" %}
5005  ins_encode %{
5006    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5007  %}
5008  ins_pipe( pipe_slow );
5009%}
5010
5011instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5012  predicate(UseAVX > 0);
5013  match(Set dst (AddVD src1 src2));
5014  format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5015  ins_encode %{
5016    int vlen_enc = vector_length_encoding(this);
5017    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5018  %}
5019  ins_pipe( pipe_slow );
5020%}
5021
5022instruct vaddD_mem(vec dst, vec src, memory mem) %{
5023  predicate((UseAVX > 0) &&
5024            (vector_length_in_bytes(n->in(1)) > 8));
5025  match(Set dst (AddVD src (LoadVector mem)));
5026  format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5027  ins_encode %{
5028    int vlen_enc = vector_length_encoding(this);
5029    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5030  %}
5031  ins_pipe( pipe_slow );
5032%}
5033
5034// --------------------------------- SUB --------------------------------------
5035
5036// Bytes vector sub
5037instruct vsubB(vec dst, vec src) %{
5038  predicate(UseAVX == 0);
5039  match(Set dst (SubVB dst src));
5040  format %{ "psubb   $dst,$src\t! sub packedB" %}
5041  ins_encode %{
5042    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5043  %}
5044  ins_pipe( pipe_slow );
5045%}
5046
5047instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5048  predicate(UseAVX > 0);
5049  match(Set dst (SubVB src1 src2));
5050  format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5051  ins_encode %{
5052    int vlen_enc = vector_length_encoding(this);
5053    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5054  %}
5055  ins_pipe( pipe_slow );
5056%}
5057
5058instruct vsubB_mem(vec dst, vec src, memory mem) %{
5059  predicate((UseAVX > 0) &&
5060            (vector_length_in_bytes(n->in(1)) > 8));
5061  match(Set dst (SubVB src (LoadVector mem)));
5062  format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5063  ins_encode %{
5064    int vlen_enc = vector_length_encoding(this);
5065    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5066  %}
5067  ins_pipe( pipe_slow );
5068%}
5069
5070// Shorts/Chars vector sub
5071instruct vsubS(vec dst, vec src) %{
5072  predicate(UseAVX == 0);
5073  match(Set dst (SubVS dst src));
5074  format %{ "psubw   $dst,$src\t! sub packedS" %}
5075  ins_encode %{
5076    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5077  %}
5078  ins_pipe( pipe_slow );
5079%}
5080
5081
5082instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5083  predicate(UseAVX > 0);
5084  match(Set dst (SubVS src1 src2));
5085  format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5086  ins_encode %{
5087    int vlen_enc = vector_length_encoding(this);
5088    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5089  %}
5090  ins_pipe( pipe_slow );
5091%}
5092
5093instruct vsubS_mem(vec dst, vec src, memory mem) %{
5094  predicate((UseAVX > 0) &&
5095            (vector_length_in_bytes(n->in(1)) > 8));
5096  match(Set dst (SubVS src (LoadVector mem)));
5097  format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5098  ins_encode %{
5099    int vlen_enc = vector_length_encoding(this);
5100    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5101  %}
5102  ins_pipe( pipe_slow );
5103%}
5104
5105// Integers vector sub
5106instruct vsubI(vec dst, vec src) %{
5107  predicate(UseAVX == 0);
5108  match(Set dst (SubVI dst src));
5109  format %{ "psubd   $dst,$src\t! sub packedI" %}
5110  ins_encode %{
5111    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5112  %}
5113  ins_pipe( pipe_slow );
5114%}
5115
5116instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5117  predicate(UseAVX > 0);
5118  match(Set dst (SubVI src1 src2));
5119  format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5120  ins_encode %{
5121    int vlen_enc = vector_length_encoding(this);
5122    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5123  %}
5124  ins_pipe( pipe_slow );
5125%}
5126
5127instruct vsubI_mem(vec dst, vec src, memory mem) %{
5128  predicate((UseAVX > 0) &&
5129            (vector_length_in_bytes(n->in(1)) > 8));
5130  match(Set dst (SubVI src (LoadVector mem)));
5131  format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5132  ins_encode %{
5133    int vlen_enc = vector_length_encoding(this);
5134    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5135  %}
5136  ins_pipe( pipe_slow );
5137%}
5138
5139// Longs vector sub
5140instruct vsubL(vec dst, vec src) %{
5141  predicate(UseAVX == 0);
5142  match(Set dst (SubVL dst src));
5143  format %{ "psubq   $dst,$src\t! sub packedL" %}
5144  ins_encode %{
5145    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5146  %}
5147  ins_pipe( pipe_slow );
5148%}
5149
5150instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5151  predicate(UseAVX > 0);
5152  match(Set dst (SubVL src1 src2));
5153  format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5154  ins_encode %{
5155    int vlen_enc = vector_length_encoding(this);
5156    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5157  %}
5158  ins_pipe( pipe_slow );
5159%}
5160
5161
5162instruct vsubL_mem(vec dst, vec src, memory mem) %{
5163  predicate((UseAVX > 0) &&
5164            (vector_length_in_bytes(n->in(1)) > 8));
5165  match(Set dst (SubVL src (LoadVector mem)));
5166  format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5167  ins_encode %{
5168    int vlen_enc = vector_length_encoding(this);
5169    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5170  %}
5171  ins_pipe( pipe_slow );
5172%}
5173
5174// Floats vector sub
5175instruct vsubF(vec dst, vec src) %{
5176  predicate(UseAVX == 0);
5177  match(Set dst (SubVF dst src));
5178  format %{ "subps   $dst,$src\t! sub packedF" %}
5179  ins_encode %{
5180    __ subps($dst$$XMMRegister, $src$$XMMRegister);
5181  %}
5182  ins_pipe( pipe_slow );
5183%}
5184
5185instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5186  predicate(UseAVX > 0);
5187  match(Set dst (SubVF src1 src2));
5188  format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5189  ins_encode %{
5190    int vlen_enc = vector_length_encoding(this);
5191    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5192  %}
5193  ins_pipe( pipe_slow );
5194%}
5195
5196instruct vsubF_mem(vec dst, vec src, memory mem) %{
5197  predicate((UseAVX > 0) &&
5198            (vector_length_in_bytes(n->in(1)) > 8));
5199  match(Set dst (SubVF src (LoadVector mem)));
5200  format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5201  ins_encode %{
5202    int vlen_enc = vector_length_encoding(this);
5203    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5204  %}
5205  ins_pipe( pipe_slow );
5206%}
5207
5208// Doubles vector sub
5209instruct vsubD(vec dst, vec src) %{
5210  predicate(UseAVX == 0);
5211  match(Set dst (SubVD dst src));
5212  format %{ "subpd   $dst,$src\t! sub packedD" %}
5213  ins_encode %{
5214    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5215  %}
5216  ins_pipe( pipe_slow );
5217%}
5218
5219instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5220  predicate(UseAVX > 0);
5221  match(Set dst (SubVD src1 src2));
5222  format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5223  ins_encode %{
5224    int vlen_enc = vector_length_encoding(this);
5225    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5226  %}
5227  ins_pipe( pipe_slow );
5228%}
5229
5230instruct vsubD_mem(vec dst, vec src, memory mem) %{
5231  predicate((UseAVX > 0) &&
5232            (vector_length_in_bytes(n->in(1)) > 8));
5233  match(Set dst (SubVD src (LoadVector mem)));
5234  format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5235  ins_encode %{
5236    int vlen_enc = vector_length_encoding(this);
5237    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5238  %}
5239  ins_pipe( pipe_slow );
5240%}
5241
5242// --------------------------------- MUL --------------------------------------
5243
5244// Byte vector mul
5245instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5246  predicate(vector_length(n) == 4 ||
5247            vector_length(n) == 8);
5248  match(Set dst (MulVB src1 src2));
5249  effect(TEMP dst, TEMP tmp, TEMP scratch);
5250  format %{"vector_mulB $dst,$src1,$src2" %}
5251  ins_encode %{
5252    assert(UseSSE > 3, "required");
5253    __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5254    __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5255    __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5256    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5257    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5258    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5259  %}
5260  ins_pipe( pipe_slow );
5261%}
5262
5263instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5264  predicate(vector_length(n) == 16 && UseAVX <= 1);
5265  match(Set dst (MulVB src1 src2));
5266  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5267  format %{"vector_mulB $dst,$src1,$src2" %}
5268  ins_encode %{
5269    assert(UseSSE > 3, "required");
5270    __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5271    __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5272    __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5273    __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5274    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5275    __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5276    __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5277    __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5278    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5279    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5280    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5281    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5282  %}
5283  ins_pipe( pipe_slow );
5284%}
5285
5286instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5287  predicate(vector_length(n) == 16 && UseAVX > 1);
5288  match(Set dst (MulVB src1 src2));
5289  effect(TEMP dst, TEMP tmp, TEMP scratch);
5290  format %{"vector_mulB $dst,$src1,$src2" %}
5291  ins_encode %{
5292  int vlen_enc = Assembler::AVX_256bit;
5293    __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5294    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5295    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5296    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5297    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5298    __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5299    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5300  %}
5301  ins_pipe( pipe_slow );
5302%}
5303
5304instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5305  predicate(vector_length(n) == 32);
5306  match(Set dst (MulVB src1 src2));
5307  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5308  format %{"vector_mulB $dst,$src1,$src2" %}
5309  ins_encode %{
5310    assert(UseAVX > 1, "required");
5311    int vlen_enc = Assembler::AVX_256bit;
5312    __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5313    __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5314    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5315    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5316    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5317    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5318    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5319    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5320    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5321    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5322    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5323    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5324    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5325    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5326  %}
5327  ins_pipe( pipe_slow );
5328%}
5329
5330instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5331  predicate(vector_length(n) == 64);
5332  match(Set dst (MulVB src1 src2));
5333  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5334  format %{"vector_mulB $dst,$src1,$src2\n\t" %}
5335  ins_encode %{
5336    assert(UseAVX > 2, "required");
5337    int vlen_enc = Assembler::AVX_512bit;
5338    __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5339    __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5340    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5341    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5342    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5343    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5344    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5345    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5346    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5347    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5348    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5349    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5350    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5351    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5352    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5353  %}
5354  ins_pipe( pipe_slow );
5355%}
5356
5357// Shorts/Chars vector mul
5358instruct vmulS(vec dst, vec src) %{
5359  predicate(UseAVX == 0);
5360  match(Set dst (MulVS dst src));
5361  format %{ "pmullw $dst,$src\t! mul packedS" %}
5362  ins_encode %{
5363    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5364  %}
5365  ins_pipe( pipe_slow );
5366%}
5367
5368instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5369  predicate(UseAVX > 0);
5370  match(Set dst (MulVS src1 src2));
5371  format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5372  ins_encode %{
5373    int vlen_enc = vector_length_encoding(this);
5374    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5375  %}
5376  ins_pipe( pipe_slow );
5377%}
5378
5379instruct vmulS_mem(vec dst, vec src, memory mem) %{
5380  predicate((UseAVX > 0) &&
5381            (vector_length_in_bytes(n->in(1)) > 8));
5382  match(Set dst (MulVS src (LoadVector mem)));
5383  format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5384  ins_encode %{
5385    int vlen_enc = vector_length_encoding(this);
5386    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5387  %}
5388  ins_pipe( pipe_slow );
5389%}
5390
5391// Integers vector mul
5392instruct vmulI(vec dst, vec src) %{
5393  predicate(UseAVX == 0);
5394  match(Set dst (MulVI dst src));
5395  format %{ "pmulld  $dst,$src\t! mul packedI" %}
5396  ins_encode %{
5397    assert(UseSSE > 3, "required");
5398    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5399  %}
5400  ins_pipe( pipe_slow );
5401%}
5402
5403instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5404  predicate(UseAVX > 0);
5405  match(Set dst (MulVI src1 src2));
5406  format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5407  ins_encode %{
5408    int vlen_enc = vector_length_encoding(this);
5409    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5410  %}
5411  ins_pipe( pipe_slow );
5412%}
5413
5414instruct vmulI_mem(vec dst, vec src, memory mem) %{
5415  predicate((UseAVX > 0) &&
5416            (vector_length_in_bytes(n->in(1)) > 8));
5417  match(Set dst (MulVI src (LoadVector mem)));
5418  format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5419  ins_encode %{
5420    int vlen_enc = vector_length_encoding(this);
5421    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5422  %}
5423  ins_pipe( pipe_slow );
5424%}
5425
5426// Longs vector mul
5427instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5428  predicate(VM_Version::supports_avx512dq());
5429  match(Set dst (MulVL src1 src2));
5430  format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5431  ins_encode %{
5432    assert(UseAVX > 2, "required");
5433    int vlen_enc = vector_length_encoding(this);
5434    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5435  %}
5436  ins_pipe( pipe_slow );
5437%}
5438
5439instruct vmulL_mem(vec dst, vec src, memory mem) %{
5440  predicate(VM_Version::supports_avx512dq() &&
5441              (vector_length_in_bytes(n->in(1)) > 8));
5442  match(Set dst (MulVL src (LoadVector mem)));
5443  format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5444  ins_encode %{
5445    assert(UseAVX > 2, "required");
5446    int vlen_enc = vector_length_encoding(this);
5447    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5448  %}
5449  ins_pipe( pipe_slow );
5450%}
5451
5452instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5453  predicate(vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5454  match(Set dst (MulVL dst src2));
5455  effect(TEMP dst, TEMP tmp);
5456  format %{ "pshufd $tmp,$src2, 177\n\t"
5457            "pmulld $tmp,$dst\n\t"
5458            "phaddd $tmp,$tmp\n\t"
5459            "pmovzxdq $tmp,$tmp\n\t"
5460            "psllq $tmp, 32\n\t"
5461            "pmuludq $dst,$src2\n\t"
5462            "paddq $dst,$tmp\n\t! mul packed2L" %}
5463
5464  ins_encode %{
5465    assert(VM_Version::supports_sse4_1(), "required");
5466    int vlen_enc = Assembler::AVX_128bit;
5467    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5468    __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5469    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5470    __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5471    __ psllq($tmp$$XMMRegister, 32);
5472    __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5473    __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5474  %}
5475  ins_pipe( pipe_slow );
5476%}
5477
5478instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5479  predicate(vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5480  match(Set dst (MulVL src1 src2));
5481  effect(TEMP tmp1, TEMP tmp);
5482  format %{ "vpshufd $tmp,$src2\n\t"
5483            "vpmulld $tmp,$src1,$tmp\n\t"
5484            "vphaddd $tmp,$tmp,$tmp\n\t"
5485            "vpmovzxdq $tmp,$tmp\n\t"
5486            "vpsllq $tmp,$tmp\n\t"
5487            "vpmuludq $tmp1,$src1,$src2\n\t"
5488            "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5489  ins_encode %{
5490    int vlen_enc = Assembler::AVX_256bit;
5491    __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5492    __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5493    __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5494    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5495    __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5496    __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5497    __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5498    __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5499  %}
5500  ins_pipe( pipe_slow );
5501%}
5502
5503// Floats vector mul
5504instruct vmulF(vec dst, vec src) %{
5505  predicate(UseAVX == 0);
5506  match(Set dst (MulVF dst src));
5507  format %{ "mulps   $dst,$src\t! mul packedF" %}
5508  ins_encode %{
5509    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5510  %}
5511  ins_pipe( pipe_slow );
5512%}
5513
5514instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5515  predicate(UseAVX > 0);
5516  match(Set dst (MulVF src1 src2));
5517  format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5518  ins_encode %{
5519    int vlen_enc = vector_length_encoding(this);
5520    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5521  %}
5522  ins_pipe( pipe_slow );
5523%}
5524
5525instruct vmulF_mem(vec dst, vec src, memory mem) %{
5526  predicate((UseAVX > 0) &&
5527            (vector_length_in_bytes(n->in(1)) > 8));
5528  match(Set dst (MulVF src (LoadVector mem)));
5529  format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5530  ins_encode %{
5531    int vlen_enc = vector_length_encoding(this);
5532    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5533  %}
5534  ins_pipe( pipe_slow );
5535%}
5536
5537// Doubles vector mul
5538instruct vmulD(vec dst, vec src) %{
5539  predicate(UseAVX == 0);
5540  match(Set dst (MulVD dst src));
5541  format %{ "mulpd   $dst,$src\t! mul packedD" %}
5542  ins_encode %{
5543    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5544  %}
5545  ins_pipe( pipe_slow );
5546%}
5547
5548instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5549  predicate(UseAVX > 0);
5550  match(Set dst (MulVD src1 src2));
5551  format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5552  ins_encode %{
5553    int vlen_enc = vector_length_encoding(this);
5554    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5555  %}
5556  ins_pipe( pipe_slow );
5557%}
5558
5559instruct vmulD_mem(vec dst, vec src, memory mem) %{
5560  predicate((UseAVX > 0) &&
5561            (vector_length_in_bytes(n->in(1)) > 8));
5562  match(Set dst (MulVD src (LoadVector mem)));
5563  format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5564  ins_encode %{
5565    int vlen_enc = vector_length_encoding(this);
5566    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5567  %}
5568  ins_pipe( pipe_slow );
5569%}
5570
5571instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5572  predicate(vector_length(n) == 8);
5573  match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5574  effect(TEMP dst, USE src1, USE src2);
5575  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5576            "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5577         %}
5578  ins_encode %{
5579    assert(UseAVX > 0, "required");
5580
5581    int vlen_enc = Assembler::AVX_256bit;
5582    int cond = (Assembler::Condition)($copnd$$cmpcode);
5583    __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5584    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5585  %}
5586  ins_pipe( pipe_slow );
5587%}
5588
5589instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5590  predicate(vector_length(n) == 4);
5591  match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5592  effect(TEMP dst, USE src1, USE src2);
5593  format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5594            "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5595         %}
5596  ins_encode %{
5597    assert(UseAVX > 0, "required");
5598
5599    int vlen_enc = Assembler::AVX_256bit;
5600    int cond = (Assembler::Condition)($copnd$$cmpcode);
5601    __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5602    __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5603  %}
5604  ins_pipe( pipe_slow );
5605%}
5606
5607// --------------------------------- DIV --------------------------------------
5608
5609// Floats vector div
5610instruct vdivF(vec dst, vec src) %{
5611  predicate(UseAVX == 0);
5612  match(Set dst (DivVF dst src));
5613  format %{ "divps   $dst,$src\t! div packedF" %}
5614  ins_encode %{
5615    __ divps($dst$$XMMRegister, $src$$XMMRegister);
5616  %}
5617  ins_pipe( pipe_slow );
5618%}
5619
5620instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5621  predicate(UseAVX > 0);
5622  match(Set dst (DivVF src1 src2));
5623  format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5624  ins_encode %{
5625    int vlen_enc = vector_length_encoding(this);
5626    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5627  %}
5628  ins_pipe( pipe_slow );
5629%}
5630
5631instruct vdivF_mem(vec dst, vec src, memory mem) %{
5632  predicate((UseAVX > 0) &&
5633            (vector_length_in_bytes(n->in(1)) > 8));
5634  match(Set dst (DivVF src (LoadVector mem)));
5635  format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5636  ins_encode %{
5637    int vlen_enc = vector_length_encoding(this);
5638    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5639  %}
5640  ins_pipe( pipe_slow );
5641%}
5642
5643// Doubles vector div
5644instruct vdivD(vec dst, vec src) %{
5645  predicate(UseAVX == 0);
5646  match(Set dst (DivVD dst src));
5647  format %{ "divpd   $dst,$src\t! div packedD" %}
5648  ins_encode %{
5649    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5650  %}
5651  ins_pipe( pipe_slow );
5652%}
5653
5654instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5655  predicate(UseAVX > 0);
5656  match(Set dst (DivVD src1 src2));
5657  format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5658  ins_encode %{
5659    int vlen_enc = vector_length_encoding(this);
5660    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5661  %}
5662  ins_pipe( pipe_slow );
5663%}
5664
5665instruct vdivD_mem(vec dst, vec src, memory mem) %{
5666  predicate((UseAVX > 0) &&
5667            (vector_length_in_bytes(n->in(1)) > 8));
5668  match(Set dst (DivVD src (LoadVector mem)));
5669  format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5670  ins_encode %{
5671    int vlen_enc = vector_length_encoding(this);
5672    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5673  %}
5674  ins_pipe( pipe_slow );
5675%}
5676
5677// ------------------------------ MinMax ---------------------------------------
5678
5679// Byte, Short, Int vector Min/Max
5680instruct minmax_reg_sse(vec dst, vec src) %{
5681  predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5682            UseAVX == 0);
5683  match(Set dst (MinV dst src));
5684  match(Set dst (MaxV dst src));
5685  format %{ "vector_minmax  $dst,$src\t!  " %}
5686  ins_encode %{
5687    assert(UseSSE >= 4, "required");
5688
5689    int opcode = this->ideal_Opcode();
5690    BasicType elem_bt = vector_element_basic_type(this);
5691    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5692  %}
5693  ins_pipe( pipe_slow );
5694%}
5695
5696instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5697  predicate(is_integral_type(vector_element_basic_type(n)) && vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5698            UseAVX > 0);
5699  match(Set dst (MinV src1 src2));
5700  match(Set dst (MaxV src1 src2));
5701  format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5702  ins_encode %{
5703    int opcode = this->ideal_Opcode();
5704    int vlen_enc = vector_length_encoding(this);
5705    BasicType elem_bt = vector_element_basic_type(this);
5706
5707    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5708  %}
5709  ins_pipe( pipe_slow );
5710%}
5711
5712// Long vector Min/Max
5713instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5714  predicate(vector_length_in_bytes(n) == 16 && vector_element_basic_type(n) == T_LONG &&
5715            UseAVX == 0);
5716  match(Set dst (MinV dst src));
5717  match(Set dst (MaxV src dst));
5718  effect(TEMP dst, TEMP tmp);
5719  format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5720  ins_encode %{
5721    assert(UseSSE >= 4, "required");
5722
5723    int opcode = this->ideal_Opcode();
5724    BasicType elem_bt = vector_element_basic_type(this);
5725    assert(elem_bt == T_LONG, "sanity");
5726
5727    __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5728  %}
5729  ins_pipe( pipe_slow );
5730%}
5731
5732instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5733  predicate(vector_length_in_bytes(n) <= 32 && vector_element_basic_type(n) == T_LONG &&
5734            UseAVX > 0 && !VM_Version::supports_avx512vl());
5735  match(Set dst (MinV src1 src2));
5736  match(Set dst (MaxV src1 src2));
5737  effect(TEMP dst);
5738  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5739  ins_encode %{
5740    int vlen_enc = vector_length_encoding(this);
5741    int opcode = this->ideal_Opcode();
5742    BasicType elem_bt = vector_element_basic_type(this);
5743    assert(elem_bt == T_LONG, "sanity");
5744
5745    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5746  %}
5747  ins_pipe( pipe_slow );
5748%}
5749
5750instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5751  predicate((vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5752            vector_element_basic_type(n) == T_LONG);
5753  match(Set dst (MinV src1 src2));
5754  match(Set dst (MaxV src1 src2));
5755  format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
5756  ins_encode %{
5757    assert(UseAVX > 2, "required");
5758
5759    int vlen_enc = vector_length_encoding(this);
5760    int opcode = this->ideal_Opcode();
5761    BasicType elem_bt = vector_element_basic_type(this);
5762    assert(elem_bt == T_LONG, "sanity");
5763
5764    __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5765  %}
5766  ins_pipe( pipe_slow );
5767%}
5768
5769// Float/Double vector Min/Max
5770instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5771  predicate(vector_length_in_bytes(n) <= 32 &&
5772            is_floating_point_type(vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5773            UseAVX > 0);
5774  match(Set dst (MinV a b));
5775  match(Set dst (MaxV a b));
5776  effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5777  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5778  ins_encode %{
5779    assert(UseAVX > 0, "required");
5780
5781    int opcode = this->ideal_Opcode();
5782    int vlen_enc = vector_length_encoding(this);
5783    BasicType elem_bt = vector_element_basic_type(this);
5784
5785    __ vminmax_fp(opcode, elem_bt,
5786                  $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5787                  $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5788  %}
5789  ins_pipe( pipe_slow );
5790%}
5791
5792instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
5793  predicate(vector_length_in_bytes(n) == 64 &&
5794            is_floating_point_type(vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5795  match(Set dst (MinV a b));
5796  match(Set dst (MaxV a b));
5797  effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
5798  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5799  ins_encode %{
5800    assert(UseAVX > 2, "required");
5801
5802    int opcode = this->ideal_Opcode();
5803    int vlen_enc = vector_length_encoding(this);
5804    BasicType elem_bt = vector_element_basic_type(this);
5805
5806    __ evminmax_fp(opcode, elem_bt,
5807                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5808                   $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5809  %}
5810  ins_pipe( pipe_slow );
5811%}
5812
5813// --------------------------------- Signum ---------------------------
5814
5815instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
5816  match(Set dst (SignumF dst (Binary zero one)));
5817  effect(TEMP scratch, KILL cr);
5818  format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
5819  ins_encode %{
5820    int opcode = this->ideal_Opcode();
5821    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5822  %}
5823  ins_pipe( pipe_slow );
5824%}
5825
5826instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
5827  match(Set dst (SignumD dst (Binary zero one)));
5828  effect(TEMP scratch, KILL cr);
5829  format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
5830  ins_encode %{
5831    int opcode = this->ideal_Opcode();
5832    __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5833  %}
5834  ins_pipe( pipe_slow );
5835%}
5836
5837// --------------------------------- Sqrt --------------------------------------
5838
5839instruct vsqrtF_reg(vec dst, vec src) %{
5840  match(Set dst (SqrtVF src));
5841  format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
5842  ins_encode %{
5843    assert(UseAVX > 0, "required");
5844    int vlen_enc = vector_length_encoding(this);
5845    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5846  %}
5847  ins_pipe( pipe_slow );
5848%}
5849
5850instruct vsqrtF_mem(vec dst, memory mem) %{
5851  predicate(vector_length_in_bytes(n->in(1)) > 8);
5852  match(Set dst (SqrtVF (LoadVector mem)));
5853  format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
5854  ins_encode %{
5855    assert(UseAVX > 0, "required");
5856    int vlen_enc = vector_length_encoding(this);
5857    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5858  %}
5859  ins_pipe( pipe_slow );
5860%}
5861
5862// Floating point vector sqrt
5863instruct vsqrtD_reg(vec dst, vec src) %{
5864  match(Set dst (SqrtVD src));
5865  format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
5866  ins_encode %{
5867    assert(UseAVX > 0, "required");
5868    int vlen_enc = vector_length_encoding(this);
5869    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5870  %}
5871  ins_pipe( pipe_slow );
5872%}
5873
5874instruct vsqrtD_mem(vec dst, memory mem) %{
5875  predicate(vector_length_in_bytes(n->in(1)) > 8);
5876  match(Set dst (SqrtVD (LoadVector mem)));
5877  format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
5878  ins_encode %{
5879    assert(UseAVX > 0, "required");
5880    int vlen_enc = vector_length_encoding(this);
5881    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5882  %}
5883  ins_pipe( pipe_slow );
5884%}
5885
5886// ------------------------------ Shift ---------------------------------------
5887
5888// Left and right shift count vectors are the same on x86
5889// (only lowest bits of xmm reg are used for count).
5890instruct vshiftcnt(vec dst, rRegI cnt) %{
5891  match(Set dst (LShiftCntV cnt));
5892  match(Set dst (RShiftCntV cnt));
5893  format %{ "movdl    $dst,$cnt\t! load shift count" %}
5894  ins_encode %{
5895    __ movdl($dst$$XMMRegister, $cnt$$Register);
5896  %}
5897  ins_pipe( pipe_slow );
5898%}
5899
5900// Byte vector shift
5901instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5902  predicate(vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5903  match(Set dst ( LShiftVB src shift));
5904  match(Set dst ( RShiftVB src shift));
5905  match(Set dst (URShiftVB src shift));
5906  effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5907  format %{"vector_byte_shift $dst,$src,$shift" %}
5908  ins_encode %{
5909    assert(UseSSE > 3, "required");
5910    int opcode = this->ideal_Opcode();
5911    bool sign = (opcode != Op_URShiftVB);
5912    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5913    __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5914    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5915    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5916    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5917  %}
5918  ins_pipe( pipe_slow );
5919%}
5920
5921instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5922  predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5923            UseAVX <= 1);
5924  match(Set dst ( LShiftVB src shift));
5925  match(Set dst ( RShiftVB src shift));
5926  match(Set dst (URShiftVB src shift));
5927  effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5928  format %{"vector_byte_shift $dst,$src,$shift" %}
5929  ins_encode %{
5930    assert(UseSSE > 3, "required");
5931    int opcode = this->ideal_Opcode();
5932    bool sign = (opcode != Op_URShiftVB);
5933    __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5934    __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5935    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5936    __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5937    __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5938    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5939    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5940    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5941    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5942  %}
5943  ins_pipe( pipe_slow );
5944%}
5945
5946instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5947  predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5948            UseAVX > 1);
5949  match(Set dst ( LShiftVB src shift));
5950  match(Set dst ( RShiftVB src shift));
5951  match(Set dst (URShiftVB src shift));
5952  effect(TEMP dst, TEMP tmp, TEMP scratch);
5953  format %{"vector_byte_shift $dst,$src,$shift" %}
5954  ins_encode %{
5955    int opcode = this->ideal_Opcode();
5956    bool sign = (opcode != Op_URShiftVB);
5957    int vlen_enc = Assembler::AVX_256bit;
5958    __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5959    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5960    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5961    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5962    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5963  %}
5964  ins_pipe( pipe_slow );
5965%}
5966
5967instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5968  predicate(vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
5969  match(Set dst ( LShiftVB src shift));
5970  match(Set dst ( RShiftVB src shift));
5971  match(Set dst (URShiftVB src shift));
5972  effect(TEMP dst, TEMP tmp, TEMP scratch);
5973  format %{"vector_byte_shift $dst,$src,$shift" %}
5974  ins_encode %{
5975    assert(UseAVX > 1, "required");
5976    int opcode = this->ideal_Opcode();
5977    bool sign = (opcode != Op_URShiftVB);
5978    int vlen_enc = Assembler::AVX_256bit;
5979    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
5980    __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5981    __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5982    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5983    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5984    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5985    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5986    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5987    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5988  %}
5989  ins_pipe( pipe_slow );
5990%}
5991
5992instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5993  predicate(vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
5994  match(Set dst ( LShiftVB src shift));
5995  match(Set dst  (RShiftVB src shift));
5996  match(Set dst (URShiftVB src shift));
5997  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5998  format %{"vector_byte_shift $dst,$src,$shift" %}
5999  ins_encode %{
6000    assert(UseAVX > 2, "required");
6001    int opcode = this->ideal_Opcode();
6002    bool sign = (opcode != Op_URShiftVB);
6003    int vlen_enc = Assembler::AVX_512bit;
6004    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6005    __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6006    __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6007    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6008    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6009    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6010    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6011    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6012    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6013    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6014    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6015    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6016  %}
6017  ins_pipe( pipe_slow );
6018%}
6019
6020// Shorts vector logical right shift produces incorrect Java result
6021// for negative data because java code convert short value into int with
6022// sign extension before a shift. But char vectors are fine since chars are
6023// unsigned values.
6024// Shorts/Chars vector left shift
6025instruct vshiftS(vec dst, vec src, vec shift) %{
6026  predicate(VectorNode::is_vshift_cnt(n->in(2)));
6027  match(Set dst ( LShiftVS src shift));
6028  match(Set dst ( RShiftVS src shift));
6029  match(Set dst (URShiftVS src shift));
6030  effect(TEMP dst, USE src, USE shift);
6031  format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6032  ins_encode %{
6033    int opcode = this->ideal_Opcode();
6034    if (UseAVX > 0) {
6035      int vlen_enc = vector_length_encoding(this);
6036      __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6037    } else {
6038      int vlen = vector_length(this);
6039      if (vlen == 2) {
6040        __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6041        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6042      } else if (vlen == 4) {
6043        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6044        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6045      } else {
6046        assert (vlen == 8, "sanity");
6047        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6048        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6049      }
6050    }
6051  %}
6052  ins_pipe( pipe_slow );
6053%}
6054
6055// Integers vector left shift
6056instruct vshiftI(vec dst, vec src, vec shift) %{
6057  predicate(VectorNode::is_vshift_cnt(n->in(2)));
6058  match(Set dst ( LShiftVI src shift));
6059  match(Set dst ( RShiftVI src shift));
6060  match(Set dst (URShiftVI src shift));
6061  effect(TEMP dst, USE src, USE shift);
6062  format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6063  ins_encode %{
6064    int opcode = this->ideal_Opcode();
6065    if (UseAVX > 0) {
6066      int vlen_enc = vector_length_encoding(this);
6067      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6068    } else {
6069      int vlen = vector_length(this);
6070      if (vlen == 2) {
6071        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6072        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6073      } else {
6074        assert(vlen == 4, "sanity");
6075        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6076        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6077      }
6078    }
6079  %}
6080  ins_pipe( pipe_slow );
6081%}
6082
6083// Integers vector left constant shift
6084instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6085  match(Set dst (LShiftVI src (LShiftCntV shift)));
6086  match(Set dst (RShiftVI src (RShiftCntV shift)));
6087  match(Set dst (URShiftVI src (RShiftCntV shift)));
6088  format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6089  ins_encode %{
6090    int opcode = this->ideal_Opcode();
6091    if (UseAVX > 0) {
6092      int vector_len = vector_length_encoding(this);
6093      __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6094    } else {
6095      int vlen = vector_length(this);
6096      if (vlen == 2) {
6097        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6098        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6099      } else {
6100        assert(vlen == 4, "sanity");
6101        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6102        __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6103      }
6104    }
6105  %}
6106  ins_pipe( pipe_slow );
6107%}
6108
6109// Longs vector shift
6110instruct vshiftL(vec dst, vec src, vec shift) %{
6111  predicate(VectorNode::is_vshift_cnt(n->in(2)));
6112  match(Set dst ( LShiftVL src shift));
6113  match(Set dst (URShiftVL src shift));
6114  effect(TEMP dst, USE src, USE shift);
6115  format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6116  ins_encode %{
6117    int opcode = this->ideal_Opcode();
6118    if (UseAVX > 0) {
6119      int vlen_enc = vector_length_encoding(this);
6120      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6121    } else {
6122      assert(vector_length(this) == 2, "");
6123      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6124      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6125    }
6126  %}
6127  ins_pipe( pipe_slow );
6128%}
6129
6130// Longs vector constant shift
6131instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6132  match(Set dst (LShiftVL src (LShiftCntV shift)));
6133  match(Set dst (URShiftVL src (RShiftCntV shift)));
6134  format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6135  ins_encode %{
6136    int opcode = this->ideal_Opcode();
6137    if (UseAVX > 0) {
6138      int vector_len = vector_length_encoding(this);
6139      __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6140    } else {
6141      assert(vector_length(this) == 2, "");
6142      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6143      __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6144    }
6145  %}
6146  ins_pipe( pipe_slow );
6147%}
6148
6149// -------------------ArithmeticRightShift -----------------------------------
6150// Long vector arithmetic right shift
6151instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6152  predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6153  match(Set dst (RShiftVL src shift));
6154  effect(TEMP dst, TEMP tmp, TEMP scratch);
6155  format %{ "vshiftq $dst,$src,$shift" %}
6156  ins_encode %{
6157    uint vlen = vector_length(this);
6158    if (vlen == 2) {
6159      assert(UseSSE >= 2, "required");
6160      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6161      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6162      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6163      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6164      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6165      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6166    } else {
6167      assert(vlen == 4, "sanity");
6168      assert(UseAVX > 1, "required");
6169      int vlen_enc = Assembler::AVX_256bit;
6170      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6171      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6172      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6173      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6174      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6175    }
6176  %}
6177  ins_pipe( pipe_slow );
6178%}
6179
6180instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6181  predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6182  match(Set dst (RShiftVL src shift));
6183  format %{ "vshiftq $dst,$src,$shift" %}
6184  ins_encode %{
6185    int vlen_enc = vector_length_encoding(this);
6186    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6187  %}
6188  ins_pipe( pipe_slow );
6189%}
6190
6191// ------------------- Variable Shift -----------------------------
6192// Byte variable shift
6193instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6194  predicate(vector_length(n) <= 8 &&
6195            !VectorNode::is_vshift_cnt(n->in(2)) &&
6196            !VM_Version::supports_avx512bw());
6197  match(Set dst ( LShiftVB src shift));
6198  match(Set dst ( RShiftVB src shift));
6199  match(Set dst (URShiftVB src shift));
6200  effect(TEMP dst, TEMP vtmp, TEMP scratch);
6201  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6202  ins_encode %{
6203    assert(UseAVX >= 2, "required");
6204
6205    int opcode = this->ideal_Opcode();
6206    int vlen_enc = Assembler::AVX_128bit;
6207    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6208    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6209  %}
6210  ins_pipe( pipe_slow );
6211%}
6212
6213instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6214  predicate(vector_length(n) == 16 &&
6215            !VectorNode::is_vshift_cnt(n->in(2)) &&
6216            !VM_Version::supports_avx512bw());
6217  match(Set dst ( LShiftVB src shift));
6218  match(Set dst ( RShiftVB src shift));
6219  match(Set dst (URShiftVB src shift));
6220  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6221  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6222  ins_encode %{
6223    assert(UseAVX >= 2, "required");
6224
6225    int opcode = this->ideal_Opcode();
6226    int vlen_enc = Assembler::AVX_128bit;
6227    // Shift lower half and get word result in dst
6228    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6229
6230    // Shift upper half and get word result in vtmp1
6231    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6232    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6233    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6234
6235    // Merge and down convert the two word results to byte in dst
6236    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6237  %}
6238  ins_pipe( pipe_slow );
6239%}
6240
6241instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6242  predicate(vector_length(n) == 32 &&
6243            !VectorNode::is_vshift_cnt(n->in(2)) &&
6244            !VM_Version::supports_avx512bw());
6245  match(Set dst ( LShiftVB src shift));
6246  match(Set dst ( RShiftVB src shift));
6247  match(Set dst (URShiftVB src shift));
6248  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6249  format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6250  ins_encode %{
6251    assert(UseAVX >= 2, "required");
6252
6253    int opcode = this->ideal_Opcode();
6254    int vlen_enc = Assembler::AVX_128bit;
6255    // Process lower 128 bits and get result in dst
6256    __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6257    __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6258    __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6259    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6260    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6261
6262    // Process higher 128 bits and get result in vtmp3
6263    __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6264    __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6265    __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6266    __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6267    __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6268    __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6269    __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6270
6271    // Merge the two results in dst
6272    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6273  %}
6274  ins_pipe( pipe_slow );
6275%}
6276
6277instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6278  predicate(vector_length(n) <= 32 &&
6279            !VectorNode::is_vshift_cnt(n->in(2)) &&
6280            VM_Version::supports_avx512bw());
6281  match(Set dst ( LShiftVB src shift));
6282  match(Set dst ( RShiftVB src shift));
6283  match(Set dst (URShiftVB src shift));
6284  effect(TEMP dst, TEMP vtmp, TEMP scratch);
6285  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6286  ins_encode %{
6287    assert(UseAVX > 2, "required");
6288
6289    int opcode = this->ideal_Opcode();
6290    int vlen_enc = vector_length_encoding(this);
6291    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6292  %}
6293  ins_pipe( pipe_slow );
6294%}
6295
6296instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6297  predicate(vector_length(n) == 64 &&
6298            !VectorNode::is_vshift_cnt(n->in(2)) &&
6299            VM_Version::supports_avx512bw());
6300  match(Set dst ( LShiftVB src shift));
6301  match(Set dst ( RShiftVB src shift));
6302  match(Set dst (URShiftVB src shift));
6303  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6304  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6305  ins_encode %{
6306    assert(UseAVX > 2, "required");
6307
6308    int opcode = this->ideal_Opcode();
6309    int vlen_enc = Assembler::AVX_256bit;
6310    __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6311    __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6312    __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6313    __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6314    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6315  %}
6316  ins_pipe( pipe_slow );
6317%}
6318
6319// Short variable shift
6320instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6321  predicate(vector_length(n) <= 8 &&
6322            !VectorNode::is_vshift_cnt(n->in(2)) &&
6323            !VM_Version::supports_avx512bw());
6324  match(Set dst ( LShiftVS src shift));
6325  match(Set dst ( RShiftVS src shift));
6326  match(Set dst (URShiftVS src shift));
6327  effect(TEMP dst, TEMP vtmp, TEMP scratch);
6328  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
6329  ins_encode %{
6330    assert(UseAVX >= 2, "required");
6331
6332    int opcode = this->ideal_Opcode();
6333    bool sign = (opcode != Op_URShiftVS);
6334    int vlen_enc = Assembler::AVX_256bit;
6335    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
6336    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
6337    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6338    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6339    __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6340    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6341  %}
6342  ins_pipe( pipe_slow );
6343%}
6344
6345instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6346  predicate(vector_length(n) == 16 &&
6347            !VectorNode::is_vshift_cnt(n->in(2)) &&
6348            !VM_Version::supports_avx512bw());
6349  match(Set dst ( LShiftVS src shift));
6350  match(Set dst ( RShiftVS src shift));
6351  match(Set dst (URShiftVS src shift));
6352  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6353  format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
6354  ins_encode %{
6355    assert(UseAVX >= 2, "required");
6356
6357    int opcode = this->ideal_Opcode();
6358    bool sign = (opcode != Op_URShiftVS);
6359    int vlen_enc = Assembler::AVX_256bit;
6360    // Shift lower half, with result in vtmp2 usign vtmp1 as TEMP
6361    __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6362    __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6363    __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6364    __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6365
6366    // Shift upper half, with result in dst usign vtmp1 as TEMP
6367    __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6368    __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6369    __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6370    __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6371    __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6372    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6373
6374    // Merge lower and upper half result into dst
6375    __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6376    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6377  %}
6378  ins_pipe( pipe_slow );
6379%}
6380
6381instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6382  predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6383            VM_Version::supports_avx512bw());
6384  match(Set dst ( LShiftVS src shift));
6385  match(Set dst ( RShiftVS src shift));
6386  match(Set dst (URShiftVS src shift));
6387  format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6388  ins_encode %{
6389    assert(UseAVX > 2, "required");
6390
6391    int opcode = this->ideal_Opcode();
6392    int vlen_enc = vector_length_encoding(this);
6393    if (!VM_Version::supports_avx512vl()) {
6394      vlen_enc = Assembler::AVX_512bit;
6395    }
6396    __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6397  %}
6398  ins_pipe( pipe_slow );
6399%}
6400
6401//Integer variable shift
6402instruct vshiftI_var(vec dst, vec src, vec shift) %{
6403  predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6404  match(Set dst ( LShiftVI src shift));
6405  match(Set dst ( RShiftVI src shift));
6406  match(Set dst (URShiftVI src shift));
6407  format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6408  ins_encode %{
6409    assert(UseAVX >= 2, "required");
6410
6411    int opcode = this->ideal_Opcode();
6412    int vlen_enc = vector_length_encoding(this);
6413    __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6414  %}
6415  ins_pipe( pipe_slow );
6416%}
6417
6418//Long variable shift
6419instruct vshiftL_var(vec dst, vec src, vec shift) %{
6420  predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6421  match(Set dst ( LShiftVL src shift));
6422  match(Set dst (URShiftVL src shift));
6423  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6424  ins_encode %{
6425    assert(UseAVX >= 2, "required");
6426
6427    int opcode = this->ideal_Opcode();
6428    int vlen_enc = vector_length_encoding(this);
6429    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6430  %}
6431  ins_pipe( pipe_slow );
6432%}
6433
6434//Long variable right shift arithmetic
6435instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6436  predicate(vector_length(n) <= 4 &&
6437            !VectorNode::is_vshift_cnt(n->in(2)) &&
6438            UseAVX == 2);
6439  match(Set dst (RShiftVL src shift));
6440  effect(TEMP dst, TEMP vtmp);
6441  format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6442  ins_encode %{
6443    int opcode = this->ideal_Opcode();
6444    int vlen_enc = vector_length_encoding(this);
6445    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6446                 $vtmp$$XMMRegister);
6447  %}
6448  ins_pipe( pipe_slow );
6449%}
6450
6451instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6452  predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6453            UseAVX > 2);
6454  match(Set dst (RShiftVL src shift));
6455  format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
6456  ins_encode %{
6457    int opcode = this->ideal_Opcode();
6458    int vlen_enc = vector_length_encoding(this);
6459    __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6460  %}
6461  ins_pipe( pipe_slow );
6462%}
6463
6464// --------------------------------- AND --------------------------------------
6465
6466instruct vand(vec dst, vec src) %{
6467  predicate(UseAVX == 0);
6468  match(Set dst (AndV dst src));
6469  format %{ "pand    $dst,$src\t! and vectors" %}
6470  ins_encode %{
6471    __ pand($dst$$XMMRegister, $src$$XMMRegister);
6472  %}
6473  ins_pipe( pipe_slow );
6474%}
6475
6476instruct vand_reg(vec dst, vec src1, vec src2) %{
6477  predicate(UseAVX > 0);
6478  match(Set dst (AndV src1 src2));
6479  format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6480  ins_encode %{
6481    int vlen_enc = vector_length_encoding(this);
6482    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6483  %}
6484  ins_pipe( pipe_slow );
6485%}
6486
6487instruct vand_mem(vec dst, vec src, memory mem) %{
6488  predicate((UseAVX > 0) &&
6489            (vector_length_in_bytes(n->in(1)) > 8));
6490  match(Set dst (AndV src (LoadVector mem)));
6491  format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6492  ins_encode %{
6493    int vlen_enc = vector_length_encoding(this);
6494    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6495  %}
6496  ins_pipe( pipe_slow );
6497%}
6498
6499// --------------------------------- OR ---------------------------------------
6500
6501instruct vor(vec dst, vec src) %{
6502  predicate(UseAVX == 0);
6503  match(Set dst (OrV dst src));
6504  format %{ "por     $dst,$src\t! or vectors" %}
6505  ins_encode %{
6506    __ por($dst$$XMMRegister, $src$$XMMRegister);
6507  %}
6508  ins_pipe( pipe_slow );
6509%}
6510
6511instruct vor_reg(vec dst, vec src1, vec src2) %{
6512  predicate(UseAVX > 0);
6513  match(Set dst (OrV src1 src2));
6514  format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6515  ins_encode %{
6516    int vlen_enc = vector_length_encoding(this);
6517    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6518  %}
6519  ins_pipe( pipe_slow );
6520%}
6521
6522instruct vor_mem(vec dst, vec src, memory mem) %{
6523  predicate((UseAVX > 0) &&
6524            (vector_length_in_bytes(n->in(1)) > 8));
6525  match(Set dst (OrV src (LoadVector mem)));
6526  format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6527  ins_encode %{
6528    int vlen_enc = vector_length_encoding(this);
6529    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6530  %}
6531  ins_pipe( pipe_slow );
6532%}
6533
6534// --------------------------------- XOR --------------------------------------
6535
6536instruct vxor(vec dst, vec src) %{
6537  predicate(UseAVX == 0);
6538  match(Set dst (XorV dst src));
6539  format %{ "pxor    $dst,$src\t! xor vectors" %}
6540  ins_encode %{
6541    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6542  %}
6543  ins_pipe( pipe_slow );
6544%}
6545
6546instruct vxor_reg(vec dst, vec src1, vec src2) %{
6547  predicate(UseAVX > 0);
6548  match(Set dst (XorV src1 src2));
6549  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6550  ins_encode %{
6551    int vlen_enc = vector_length_encoding(this);
6552    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6553  %}
6554  ins_pipe( pipe_slow );
6555%}
6556
6557instruct vxor_mem(vec dst, vec src, memory mem) %{
6558  predicate((UseAVX > 0) &&
6559            (vector_length_in_bytes(n->in(1)) > 8));
6560  match(Set dst (XorV src (LoadVector mem)));
6561  format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6562  ins_encode %{
6563    int vlen_enc = vector_length_encoding(this);
6564    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6565  %}
6566  ins_pipe( pipe_slow );
6567%}
6568
6569// --------------------------------- VectorCast --------------------------------------
6570
6571instruct vcastBtoX(vec dst, vec src) %{
6572  match(Set dst (VectorCastB2X src));
6573  format %{ "vector_cast_b2x $dst,$src\t!" %}
6574  ins_encode %{
6575    assert(UseAVX > 0, "required");
6576
6577    BasicType to_elem_bt = vector_element_basic_type(this);
6578    int vlen_enc = vector_length_encoding(this);
6579    switch (to_elem_bt) {
6580      case T_SHORT:
6581        __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6582        break;
6583      case T_INT:
6584        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6585        break;
6586      case T_FLOAT:
6587        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6588        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6589        break;
6590      case T_LONG:
6591        __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6592        break;
6593      case T_DOUBLE:
6594        __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6595        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6596        break;
6597
6598      default: assert(false, "%s", type2name(to_elem_bt));
6599    }
6600  %}
6601  ins_pipe( pipe_slow );
6602%}
6603
6604instruct castStoX(vec dst, vec src, rRegP scratch) %{
6605  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6606            vector_length(n->in(1)) <= 8 && // src
6607            vector_element_basic_type(n) == T_BYTE);
6608  effect(TEMP scratch);
6609  match(Set dst (VectorCastS2X src));
6610  format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6611  ins_encode %{
6612    assert(UseAVX > 0, "required");
6613
6614    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6615    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6616  %}
6617  ins_pipe( pipe_slow );
6618%}
6619
6620instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6621  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6622            vector_length(n->in(1)) == 16 && // src
6623            vector_element_basic_type(n) == T_BYTE);
6624  effect(TEMP dst, TEMP vtmp, TEMP scratch);
6625  match(Set dst (VectorCastS2X src));
6626  format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6627  ins_encode %{
6628    assert(UseAVX > 0, "required");
6629
6630    int vlen_enc = vector_length_encoding(vector_length_in_bytes(this, $src));
6631    __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6632    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6633    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6634  %}
6635  ins_pipe( pipe_slow );
6636%}
6637
6638instruct vcastStoX_evex(vec dst, vec src) %{
6639  predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6640            (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
6641  match(Set dst (VectorCastS2X src));
6642  format %{ "vector_cast_s2x $dst,$src\t!" %}
6643  ins_encode %{
6644    BasicType to_elem_bt = vector_element_basic_type(this);
6645    int src_vlen_enc = vector_length_encoding(this, $src);
6646    int vlen_enc = vector_length_encoding(this);
6647    switch (to_elem_bt) {
6648      case T_BYTE:
6649        if (!VM_Version::supports_avx512vl()) {
6650          vlen_enc = Assembler::AVX_512bit;
6651        }
6652        __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6653        break;
6654      case T_INT:
6655        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6656        break;
6657      case T_FLOAT:
6658        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6659        __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6660        break;
6661      case T_LONG:
6662        __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6663        break;
6664      case T_DOUBLE:
6665        __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6666        __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6667        break;
6668      default:
6669        ShouldNotReachHere();
6670    }
6671  %}
6672  ins_pipe( pipe_slow );
6673%}
6674
6675instruct castItoX(vec dst, vec src, rRegP scratch) %{
6676  predicate(UseAVX <= 2 &&
6677            (vector_length_in_bytes(n->in(1)) <= 16) &&
6678            (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
6679  match(Set dst (VectorCastI2X src));
6680  format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6681  effect(TEMP scratch);
6682  ins_encode %{
6683    assert(UseAVX > 0, "required");
6684
6685    BasicType to_elem_bt = vector_element_basic_type(this);
6686    int vlen_enc = vector_length_encoding(this, $src);
6687
6688    if (to_elem_bt == T_BYTE) {
6689      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6690      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6691      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6692    } else {
6693      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6694      __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6695      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6696    }
6697  %}
6698  ins_pipe( pipe_slow );
6699%}
6700
6701instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6702  predicate(UseAVX <= 2 &&
6703            (vector_length_in_bytes(n->in(1)) == 32) &&
6704            (vector_length_in_bytes(n) < vector_length_in_bytes(n->in(1)))); // dst < src
6705  match(Set dst (VectorCastI2X src));
6706  format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
6707  effect(TEMP dst, TEMP vtmp, TEMP scratch);
6708  ins_encode %{
6709    assert(UseAVX > 0, "required");
6710
6711    BasicType to_elem_bt = vector_element_basic_type(this);
6712    int vlen_enc = vector_length_encoding(this, $src);
6713
6714    if (to_elem_bt == T_BYTE) {
6715      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6716      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6717      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6718      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6719    } else {
6720      assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6721      __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6722      __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6723      __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6724    }
6725  %}
6726  ins_pipe( pipe_slow );
6727%}
6728
6729instruct vcastItoX_evex(vec dst, vec src) %{
6730  predicate(UseAVX > 2 ||
6731            (vector_length_in_bytes(n) >= vector_length_in_bytes(n->in(1)))); // dst >= src
6732  match(Set dst (VectorCastI2X src));
6733  format %{ "vector_cast_i2x $dst,$src\t!" %}
6734  ins_encode %{
6735    assert(UseAVX > 0, "required");
6736
6737    BasicType dst_elem_bt = vector_element_basic_type(this);
6738    int src_vlen_enc = vector_length_encoding(this, $src);
6739    int dst_vlen_enc = vector_length_encoding(this);
6740    switch (dst_elem_bt) {
6741      case T_BYTE:
6742        if (!VM_Version::supports_avx512vl()) {
6743          src_vlen_enc = Assembler::AVX_512bit;
6744        }
6745        __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6746        break;
6747      case T_SHORT:
6748        if (!VM_Version::supports_avx512vl()) {
6749          src_vlen_enc = Assembler::AVX_512bit;
6750        }
6751        __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6752        break;
6753      case T_FLOAT:
6754        __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6755        break;
6756      case T_LONG:
6757        __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6758        break;
6759      case T_DOUBLE:
6760        __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6761        break;
6762      default:
6763        ShouldNotReachHere();
6764    }
6765  %}
6766  ins_pipe( pipe_slow );
6767%}
6768
6769instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
6770  predicate((vector_element_basic_type(n) == T_BYTE || vector_element_basic_type(n) == T_SHORT) &&
6771            UseAVX <= 2);
6772  match(Set dst (VectorCastL2X src));
6773  effect(TEMP scratch);
6774  format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
6775  ins_encode %{
6776    assert(UseAVX > 0, "required");
6777
6778    int vlen = vector_length_in_bytes(this, $src);
6779    BasicType to_elem_bt  = vector_element_basic_type(this);
6780    AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
6781                                                      : ExternalAddress(vector_int_to_short_mask());
6782    if (vlen <= 16) {
6783      __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
6784      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6785      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6786    } else {
6787      assert(vlen <= 32, "required");
6788      __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
6789      __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
6790      __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6791      __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6792    }
6793    if (to_elem_bt == T_BYTE) {
6794      __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6795    }
6796  %}
6797  ins_pipe( pipe_slow );
6798%}
6799
6800instruct vcastLtoX_evex(vec dst, vec src) %{
6801  predicate(UseAVX > 2 ||
6802            (vector_element_basic_type(n) == T_INT ||
6803             vector_element_basic_type(n) == T_FLOAT ||
6804             vector_element_basic_type(n) == T_DOUBLE));
6805  match(Set dst (VectorCastL2X src));
6806  format %{ "vector_cast_l2x  $dst,$src\t!" %}
6807  ins_encode %{
6808    BasicType to_elem_bt = vector_element_basic_type(this);
6809    int vlen = vector_length_in_bytes(this, $src);
6810    int vlen_enc = vector_length_encoding(this, $src);
6811    switch (to_elem_bt) {
6812      case T_BYTE:
6813        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6814          vlen_enc = Assembler::AVX_512bit;
6815        }
6816        __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6817        break;
6818      case T_SHORT:
6819        if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6820          vlen_enc = Assembler::AVX_512bit;
6821        }
6822        __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6823        break;
6824      case T_INT:
6825        if (vlen == 8) {
6826          if ($dst$$XMMRegister != $src$$XMMRegister) {
6827            __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6828          }
6829        } else if (vlen == 16) {
6830          __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
6831        } else if (vlen == 32) {
6832          if (UseAVX > 2) {
6833            if (!VM_Version::supports_avx512vl()) {
6834              vlen_enc = Assembler::AVX_512bit;
6835            }
6836            __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6837          } else {
6838            __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
6839            __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
6840          }
6841        } else { // vlen == 64
6842          __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6843        }
6844        break;
6845      case T_FLOAT:
6846        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6847        __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6848        break;
6849      case T_DOUBLE:
6850        assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6851        __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6852        break;
6853
6854      default: assert(false, "%s", type2name(to_elem_bt));
6855    }
6856  %}
6857  ins_pipe( pipe_slow );
6858%}
6859
6860instruct vcastFtoD_reg(vec dst, vec src) %{
6861  predicate(vector_element_basic_type(n) == T_DOUBLE);
6862  match(Set dst (VectorCastF2X src));
6863  format %{ "vector_cast_f2x  $dst,$src\t!" %}
6864  ins_encode %{
6865    int vlen_enc = vector_length_encoding(this);
6866    __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6867  %}
6868  ins_pipe( pipe_slow );
6869%}
6870
6871instruct vcastDtoF_reg(vec dst, vec src) %{
6872  predicate(vector_element_basic_type(n) == T_FLOAT);
6873  match(Set dst (VectorCastD2X src));
6874  format %{ "vector_cast_d2x  $dst,$src\t!" %}
6875  ins_encode %{
6876    int vlen_enc = vector_length_encoding(this, $src);
6877    __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6878  %}
6879  ins_pipe( pipe_slow );
6880%}
6881
6882// --------------------------------- VectorMaskCmp --------------------------------------
6883
6884instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6885  predicate(vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6886            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6887            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6888  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6889  format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6890  ins_encode %{
6891    int vlen_enc = vector_length_encoding(this, $src1);
6892    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6893    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6894      __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6895    } else {
6896      __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6897    }
6898  %}
6899  ins_pipe( pipe_slow );
6900%}
6901
6902instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
6903  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6904            is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6905  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6906  effect(TEMP scratch, TEMP ktmp);
6907  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6908  ins_encode %{
6909    int vlen_enc = Assembler::AVX_512bit;
6910    Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6911    KRegister mask = k0; // The comparison itself is not being masked.
6912    if (vector_element_basic_type(this, $src1) == T_FLOAT) {
6913      __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6914      __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6915    } else {
6916      __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6917      __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6918    }
6919  %}
6920  ins_pipe( pipe_slow );
6921%}
6922
6923instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
6924  predicate((UseAVX <= 2 || !VM_Version::supports_avx512vl()) &&
6925            !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6926            vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
6927            vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6928            is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6929  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6930  effect(TEMP scratch);
6931  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6932  ins_encode %{
6933    int vlen_enc = vector_length_encoding(this, $src1);
6934    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6935    Assembler::Width ww = widthForType(vector_element_basic_type(this, $src1));
6936    __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
6937  %}
6938  ins_pipe( pipe_slow );
6939%}
6940
6941instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
6942  predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6943            is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6944            vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6945            vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
6946            is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6947  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6948  effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6949  format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6950  ins_encode %{
6951    int vlen = vector_length_in_bytes(this, $src1);
6952    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6953    BasicType bt = vector_element_basic_type(this, $src1);
6954    __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6955              $vtmp2$$XMMRegister, $scratch$$Register);
6956  %}
6957  ins_pipe( pipe_slow );
6958%}
6959
6960instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
6961  predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6962            is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6963            vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
6964            is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6965  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6966  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
6967  format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6968  ins_encode %{
6969    int vlen = vector_length_in_bytes(this, $src1);
6970    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6971    BasicType bt = vector_element_basic_type(this, $src1);
6972    __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6973                $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
6974  %}
6975  ins_pipe( pipe_slow );
6976%}
6977
6978instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
6979  predicate(UseAVX > 2 &&
6980            (VM_Version::supports_avx512vl() ||
6981             vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
6982             is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1
6983  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6984  effect(TEMP scratch, TEMP ktmp);
6985  format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6986  ins_encode %{
6987    assert(UseAVX > 2, "required");
6988
6989    int vlen_enc = vector_length_encoding(this, $src1);
6990    Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6991    bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
6992    KRegister mask = k0; // The comparison itself is not being masked.
6993    bool merge = false;
6994    BasicType src1_elem_bt = vector_element_basic_type(this, $src1);
6995
6996    switch (src1_elem_bt) {
6997      case T_BYTE: {
6998        __ evpcmpb($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
6999        __ evmovdqub($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7000        break;
7001      }
7002      case T_SHORT: {
7003        __ evpcmpw($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7004        __ evmovdquw($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7005        break;
7006      }
7007      case T_INT: {
7008        __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7009        __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7010        break;
7011      }
7012      case T_LONG: {
7013        __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7014        __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7015        break;
7016      }
7017      default: assert(false, "%s", type2name(src1_elem_bt));
7018    }
7019  %}
7020  ins_pipe( pipe_slow );
7021%}
7022
7023// Extract
7024
7025instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7026  predicate(vector_length_in_bytes(n->in(1)) <= 16); // src
7027  match(Set dst (ExtractI src idx));
7028  match(Set dst (ExtractS src idx));
7029#ifdef _LP64
7030  match(Set dst (ExtractB src idx));
7031#endif
7032  format %{ "extractI $dst,$src,$idx\t!" %}
7033  ins_encode %{
7034    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7035
7036    BasicType elem_bt = vector_element_basic_type(this, $src);
7037    __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7038  %}
7039  ins_pipe( pipe_slow );
7040%}
7041
7042instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7043  predicate(vector_length_in_bytes(n->in(1)) == 32 || // src
7044            vector_length_in_bytes(n->in(1)) == 64);  // src
7045  match(Set dst (ExtractI src idx));
7046  match(Set dst (ExtractS src idx));
7047#ifdef _LP64
7048  match(Set dst (ExtractB src idx));
7049#endif
7050  effect(TEMP vtmp);
7051  format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7052  ins_encode %{
7053    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7054
7055    BasicType elem_bt = vector_element_basic_type(this, $src);
7056    XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7057    __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7058  %}
7059  ins_pipe( pipe_slow );
7060%}
7061
7062#ifdef _LP64
7063instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7064  predicate(vector_length(n->in(1)) <= 2); // src
7065  match(Set dst (ExtractL src idx));
7066  format %{ "extractL $dst,$src,$idx\t!" %}
7067  ins_encode %{
7068    assert(UseSSE >= 4, "required");
7069    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7070
7071    __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7072  %}
7073  ins_pipe( pipe_slow );
7074%}
7075
7076instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7077  predicate(vector_length(n->in(1)) == 4 || // src
7078            vector_length(n->in(1)) == 8);  // src
7079  match(Set dst (ExtractL src idx));
7080  effect(TEMP vtmp);
7081  format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7082  ins_encode %{
7083    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7084
7085    XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7086    __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7087  %}
7088  ins_pipe( pipe_slow );
7089%}
7090#endif
7091
7092instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7093  predicate(vector_length(n->in(1)) <= 4);
7094  match(Set dst (ExtractF src idx));
7095  effect(TEMP dst, TEMP tmp, TEMP vtmp);
7096  format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7097  ins_encode %{
7098    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7099
7100    __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7101  %}
7102  ins_pipe( pipe_slow );
7103%}
7104
7105instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7106  predicate(vector_length(n->in(1)/*src*/) == 8 ||
7107            vector_length(n->in(1)/*src*/) == 16);
7108  match(Set dst (ExtractF src idx));
7109  effect(TEMP tmp, TEMP vtmp);
7110  format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7111  ins_encode %{
7112    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7113
7114    XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7115    __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7116  %}
7117  ins_pipe( pipe_slow );
7118%}
7119
7120instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7121  predicate(vector_length(n->in(1)) == 2); // src
7122  match(Set dst (ExtractD src idx));
7123  format %{ "extractD $dst,$src,$idx\t!" %}
7124  ins_encode %{
7125    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7126
7127    __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7128  %}
7129  ins_pipe( pipe_slow );
7130%}
7131
7132instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7133  predicate(vector_length(n->in(1)) == 4 || // src
7134            vector_length(n->in(1)) == 8);  // src
7135  match(Set dst (ExtractD src idx));
7136  effect(TEMP vtmp);
7137  format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7138  ins_encode %{
7139    assert($idx$$constant < (int)vector_length(this, $src), "out of bounds");
7140
7141    XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7142    __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7143  %}
7144  ins_pipe( pipe_slow );
7145%}
7146
7147// --------------------------------- Vector Blend --------------------------------------
7148
7149instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7150  predicate(UseAVX == 0);
7151  match(Set dst (VectorBlend (Binary dst src) mask));
7152  format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7153  effect(TEMP tmp);
7154  ins_encode %{
7155    assert(UseSSE >= 4, "required");
7156
7157    if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7158      __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7159    }
7160    __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7161  %}
7162  ins_pipe( pipe_slow );
7163%}
7164
7165instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7166  predicate(UseAVX > 0 &&
7167            vector_length_in_bytes(n) <= 32 &&
7168            is_integral_type(vector_element_basic_type(n)));
7169  match(Set dst (VectorBlend (Binary src1 src2) mask));
7170  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7171  ins_encode %{
7172    int vlen_enc = vector_length_encoding(this);
7173    __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7174  %}
7175  ins_pipe( pipe_slow );
7176%}
7177
7178instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7179  predicate(UseAVX > 0 &&
7180            vector_length_in_bytes(n) <= 32 &&
7181            !is_integral_type(vector_element_basic_type(n)));
7182  match(Set dst (VectorBlend (Binary src1 src2) mask));
7183  format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7184  ins_encode %{
7185    int vlen_enc = vector_length_encoding(this);
7186    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7187  %}
7188  ins_pipe( pipe_slow );
7189%}
7190
7191instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7192  predicate(vector_length_in_bytes(n) == 64);
7193  match(Set dst (VectorBlend (Binary src1 src2) mask));
7194  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
7195  effect(TEMP scratch, TEMP ktmp);
7196  ins_encode %{
7197     int vlen_enc = Assembler::AVX_512bit;
7198     BasicType elem_bt = vector_element_basic_type(this);
7199    __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7200    __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7201  %}
7202  ins_pipe( pipe_slow );
7203%}
7204
7205// --------------------------------- ABS --------------------------------------
7206// a = |a|
7207instruct vabsB_reg(vec dst, vec src) %{
7208  match(Set dst (AbsVB  src));
7209  format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7210  ins_encode %{
7211    uint vlen = vector_length(this);
7212    if (vlen <= 16) {
7213      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7214    } else {
7215      int vlen_enc = vector_length_encoding(this);
7216      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7217    }
7218  %}
7219  ins_pipe( pipe_slow );
7220%}
7221
7222instruct vabsS_reg(vec dst, vec src) %{
7223  match(Set dst (AbsVS  src));
7224  format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7225  ins_encode %{
7226    uint vlen = vector_length(this);
7227    if (vlen <= 8) {
7228      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7229    } else {
7230      int vlen_enc = vector_length_encoding(this);
7231      __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7232    }
7233  %}
7234  ins_pipe( pipe_slow );
7235%}
7236
7237instruct vabsI_reg(vec dst, vec src) %{
7238  match(Set dst (AbsVI  src));
7239  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7240  ins_encode %{
7241    uint vlen = vector_length(this);
7242    if (vlen <= 4) {
7243      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7244    } else {
7245      int vlen_enc = vector_length_encoding(this);
7246      __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7247    }
7248  %}
7249  ins_pipe( pipe_slow );
7250%}
7251
7252instruct vabsL_reg(vec dst, vec src) %{
7253  match(Set dst (AbsVL  src));
7254  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7255  ins_encode %{
7256    assert(UseAVX > 2, "required");
7257    int vlen_enc = vector_length_encoding(this);
7258    if (!VM_Version::supports_avx512vl()) {
7259      vlen_enc = Assembler::AVX_512bit;
7260    }
7261    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7262  %}
7263  ins_pipe( pipe_slow );
7264%}
7265
7266// --------------------------------- ABSNEG --------------------------------------
7267
7268instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7269  predicate(vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7270  match(Set dst (AbsVF src));
7271  match(Set dst (NegVF src));
7272  effect(TEMP scratch);
7273  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7274  ins_cost(150);
7275  ins_encode %{
7276    int opcode = this->ideal_Opcode();
7277    int vlen = vector_length(this);
7278    if (vlen == 2) {
7279      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7280    } else {
7281      assert(vlen == 8 || vlen == 16, "required");
7282      int vlen_enc = vector_length_encoding(this);
7283      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7284    }
7285  %}
7286  ins_pipe( pipe_slow );
7287%}
7288
7289instruct vabsneg4F(vec dst, rRegI scratch) %{
7290  predicate(vector_length(n) == 4);
7291  match(Set dst (AbsVF dst));
7292  match(Set dst (NegVF dst));
7293  effect(TEMP scratch);
7294  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7295  ins_cost(150);
7296  ins_encode %{
7297    int opcode = this->ideal_Opcode();
7298    __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7299  %}
7300  ins_pipe( pipe_slow );
7301%}
7302
7303instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7304  match(Set dst (AbsVD  src));
7305  match(Set dst (NegVD  src));
7306  effect(TEMP scratch);
7307  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7308  ins_encode %{
7309    int opcode = this->ideal_Opcode();
7310    uint vlen = vector_length(this);
7311    if (vlen == 2) {
7312      assert(UseSSE >= 2, "required");
7313      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7314    } else {
7315      int vlen_enc = vector_length_encoding(this);
7316      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7317    }
7318  %}
7319  ins_pipe( pipe_slow );
7320%}
7321
7322//------------------------------------- VectorTest --------------------------------------------
7323
7324#ifdef _LP64
7325instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7326  predicate(vector_length_in_bytes(n->in(1)) >= 4 &&
7327            vector_length_in_bytes(n->in(1)) < 16 &&
7328            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7329  match(Set dst (VectorTest src1 src2 ));
7330  effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7331  format %{ "vector_test $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7332  ins_encode %{
7333    int vlen = vector_length_in_bytes(this, $src1);
7334    __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7335    __ setb(Assembler::carrySet, $dst$$Register);
7336    __ movzbl($dst$$Register, $dst$$Register);
7337  %}
7338  ins_pipe( pipe_slow );
7339%}
7340
7341instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7342  predicate(vector_length_in_bytes(n->in(1)) >= 16 &&
7343            vector_length_in_bytes(n->in(1)) <  64 &&
7344            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7345  match(Set dst (VectorTest src1 src2 ));
7346  effect(KILL cr);
7347  format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7348  ins_encode %{
7349    int vlen = vector_length_in_bytes(this, $src1);
7350    __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7351    __ setb(Assembler::carrySet, $dst$$Register);
7352    __ movzbl($dst$$Register, $dst$$Register);
7353  %}
7354  ins_pipe( pipe_slow );
7355%}
7356
7357instruct vptest_alltrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7358  predicate(vector_length_in_bytes(n->in(1)) == 64 &&
7359            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7360  match(Set dst (VectorTest src1 src2 ));
7361  effect(KILL cr, TEMP ktmp);
7362  format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7363  ins_encode %{
7364    int vlen = vector_length_in_bytes(this, $src1);
7365    __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7366    __ setb(Assembler::carrySet, $dst$$Register);
7367    __ movzbl($dst$$Register, $dst$$Register);
7368  %}
7369  ins_pipe( pipe_slow );
7370%}
7371
7372instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7373  predicate(vector_length_in_bytes(n->in(1)) >= 4 &&
7374            vector_length_in_bytes(n->in(1)) < 16 &&
7375            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7376  match(Set dst (VectorTest src1 src2 ));
7377  effect(TEMP vtmp, KILL cr);
7378  format %{ "vector_test_any_true $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7379  ins_encode %{
7380    int vlen = vector_length_in_bytes(this, $src1);
7381    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7382    __ setb(Assembler::notZero, $dst$$Register);
7383    __ movzbl($dst$$Register, $dst$$Register);
7384  %}
7385  ins_pipe( pipe_slow );
7386%}
7387
7388instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7389  predicate(vector_length_in_bytes(n->in(1)) >= 16 &&
7390            vector_length_in_bytes(n->in(1)) < 64  &&
7391            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7392  match(Set dst (VectorTest src1 src2 ));
7393  effect(KILL cr);
7394  format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7395  ins_encode %{
7396    int vlen = vector_length_in_bytes(this, $src1);
7397    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7398    __ setb(Assembler::notZero, $dst$$Register);
7399    __ movzbl($dst$$Register, $dst$$Register);
7400  %}
7401  ins_pipe( pipe_slow );
7402%}
7403
7404instruct vptest_anytrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7405  predicate(vector_length_in_bytes(n->in(1)) == 64 &&
7406            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7407  match(Set dst (VectorTest src1 src2 ));
7408  effect(KILL cr, TEMP ktmp);
7409  format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7410  ins_encode %{
7411    int vlen = vector_length_in_bytes(this, $src1);
7412    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7413    __ setb(Assembler::notZero, $dst$$Register);
7414    __ movzbl($dst$$Register, $dst$$Register);
7415  %}
7416  ins_pipe( pipe_slow );
7417%}
7418
7419instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7420  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7421            vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7422            static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7423  match(Set cr (CmpI (VectorTest src1 src2) zero));
7424  effect(TEMP vtmp);
7425  format %{ "cmp_vector_test_any_true $src1,$src2\t! using $vtmp as TEMP" %}
7426  ins_encode %{
7427    int vlen = vector_length_in_bytes(this, $src1);
7428    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7429  %}
7430  ins_pipe( pipe_slow );
7431%}
7432
7433instruct cmpvptest_anytrue(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7434  predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7435            vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7436            static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7437  match(Set cr (CmpI (VectorTest src1 src2) zero));
7438  format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7439  ins_encode %{
7440    int vlen = vector_length_in_bytes(this, $src1);
7441    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7442  %}
7443  ins_pipe( pipe_slow );
7444%}
7445
7446instruct cmpvptest_anytrue_evex(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, kReg ktmp) %{
7447  predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 &&
7448            static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7449  match(Set cr (CmpI (VectorTest src1 src2) zero));
7450  effect(TEMP ktmp);
7451  format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7452  ins_encode %{
7453    int vlen = vector_length_in_bytes(this, $src1);
7454    __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7455  %}
7456  ins_pipe( pipe_slow );
7457%}
7458#endif
7459
7460//------------------------------------- LoadMask --------------------------------------------
7461
7462instruct loadMask(legVec dst, legVec src) %{
7463  predicate(!VM_Version::supports_avx512vlbw());
7464  match(Set dst (VectorLoadMask src));
7465  effect(TEMP dst);
7466  format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7467  ins_encode %{
7468    int vlen_in_bytes = vector_length_in_bytes(this);
7469    BasicType elem_bt = vector_element_basic_type(this);
7470
7471    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7472  %}
7473  ins_pipe( pipe_slow );
7474%}
7475
7476instruct loadMask_evex(vec dst, vec src) %{
7477  predicate(VM_Version::supports_avx512vlbw());
7478  match(Set dst (VectorLoadMask src));
7479  effect(TEMP dst);
7480  format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7481  ins_encode %{
7482    int vlen_in_bytes = vector_length_in_bytes(this);
7483    BasicType elem_bt = vector_element_basic_type(this);
7484
7485    __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, false);
7486  %}
7487  ins_pipe( pipe_slow );
7488%}
7489
7490//------------------------------------- StoreMask --------------------------------------------
7491
7492instruct storeMask1B(vec dst, vec src, immI_1 size) %{
7493  predicate(vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
7494  match(Set dst (VectorStoreMask src size));
7495  format %{ "vector_store_mask $dst,$src\t!" %}
7496  ins_encode %{
7497    assert(UseSSE >= 3, "required");
7498    if (vector_length_in_bytes(this) <= 16) {
7499      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7500    } else {
7501      assert(UseAVX >= 2, "required");
7502      int src_vlen_enc = vector_length_encoding(this, $src);
7503      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7504    }
7505  %}
7506  ins_pipe( pipe_slow );
7507%}
7508
7509instruct storeMask2B(vec dst, vec src, immI_2 size) %{
7510  predicate(vector_length(n) <= 8);
7511  match(Set dst (VectorStoreMask src size));
7512  format %{ "vector_store_mask $dst,$src\n\t" %}
7513  ins_encode %{
7514    assert(UseSSE >= 3, "required");
7515    __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7516    __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7517  %}
7518  ins_pipe( pipe_slow );
7519%}
7520
7521instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
7522  predicate(vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7523  match(Set dst (VectorStoreMask src size));
7524  effect(TEMP dst);
7525  format %{ "vector_store_mask $dst,$src\t!" %}
7526  ins_encode %{
7527    int vlen_enc = Assembler::AVX_128bit;
7528    __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7529    __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister,vlen_enc);
7530    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7531  %}
7532  ins_pipe( pipe_slow );
7533%}
7534
7535instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
7536  predicate(VM_Version::supports_avx512bw());
7537  match(Set dst (VectorStoreMask src size));
7538  format %{ "vector_store_mask $dst,$src\t!" %}
7539  ins_encode %{
7540    int src_vlen_enc = vector_length_encoding(this, $src);
7541    int dst_vlen_enc = vector_length_encoding(this);
7542    __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7543    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7544  %}
7545  ins_pipe( pipe_slow );
7546%}
7547
7548instruct storeMask4B(vec dst, vec src, immI_4 size) %{
7549  predicate (vector_length(n) <= 4 && UseAVX <= 2);
7550  match(Set dst (VectorStoreMask src size));
7551  format %{ "vector_store_mask $dst,$src\t!" %}
7552  ins_encode %{
7553    assert(UseSSE >= 3, "required");
7554    __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7555    __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7556    __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7557  %}
7558  ins_pipe( pipe_slow );
7559%}
7560
7561instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
7562  predicate(vector_length(n) == 8 && UseAVX <= 2);
7563  match(Set dst (VectorStoreMask src size));
7564  format %{ "vector_store_mask $dst,$src\t!" %}
7565  effect(TEMP dst);
7566  ins_encode %{
7567    int vlen_enc = Assembler::AVX_128bit;
7568    __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7569    __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7570    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7571    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7572  %}
7573  ins_pipe( pipe_slow );
7574%}
7575
7576instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
7577  predicate(UseAVX > 2);
7578  match(Set dst (VectorStoreMask src size));
7579  format %{ "vector_store_mask $dst,$src\t!" %}
7580  ins_encode %{
7581    int src_vlen_enc = vector_length_encoding(this, $src);
7582    int dst_vlen_enc = vector_length_encoding(this);
7583    if (!VM_Version::supports_avx512vl()) {
7584      src_vlen_enc = Assembler::AVX_512bit;
7585    }
7586    __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7587    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7588  %}
7589  ins_pipe( pipe_slow );
7590%}
7591
7592instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7593  predicate(vector_length(n) == 2 && UseAVX <= 2);
7594  match(Set dst (VectorStoreMask src size));
7595  format %{ "vector_store_mask $dst,$src\t!" %}
7596  ins_encode %{
7597    assert(UseSSE >= 3, "required");
7598    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7599    __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7600    __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7601    __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7602  %}
7603  ins_pipe( pipe_slow );
7604%}
7605
7606instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
7607  predicate(vector_length(n) == 4 && UseAVX <= 2);
7608  match(Set dst (VectorStoreMask src size));
7609  format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
7610  effect(TEMP dst, TEMP vtmp);
7611  ins_encode %{
7612    int vlen_enc = Assembler::AVX_128bit;
7613    __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7614    __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7615    __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7616    __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7617    __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7618    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7619  %}
7620  ins_pipe( pipe_slow );
7621%}
7622
7623instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
7624  predicate(UseAVX > 2);
7625  match(Set dst (VectorStoreMask src size));
7626  format %{ "vector_store_mask $dst,$src\t!" %}
7627  ins_encode %{
7628    int src_vlen_enc = vector_length_encoding(this, $src);
7629    int dst_vlen_enc = vector_length_encoding(this);
7630    if (!VM_Version::supports_avx512vl()) {
7631      src_vlen_enc = Assembler::AVX_512bit;
7632    }
7633    __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7634    __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7635  %}
7636  ins_pipe( pipe_slow );
7637%}
7638
7639instruct vmaskcast(vec dst) %{
7640  predicate((vector_length(n) == vector_length(n->in(1))) &&
7641            (vector_length_in_bytes(n) == vector_length_in_bytes(n->in(1))));
7642  match(Set dst (VectorMaskCast dst));
7643  ins_cost(0);
7644  format %{ "vector_mask_cast $dst" %}
7645  ins_encode %{
7646    // empty
7647  %}
7648  ins_pipe(empty);
7649%}
7650
7651//-------------------------------- Load Iota Indices ----------------------------------
7652
7653instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
7654  predicate(vector_element_basic_type(n) == T_BYTE);
7655  match(Set dst (VectorLoadConst src));
7656  effect(TEMP scratch);
7657  format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
7658  ins_encode %{
7659     int vlen_in_bytes = vector_length_in_bytes(this);
7660     __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
7661  %}
7662  ins_pipe( pipe_slow );
7663%}
7664
7665//-------------------------------- Rearrange ----------------------------------
7666
7667// LoadShuffle/Rearrange for Byte
7668
7669instruct loadShuffleB(vec dst) %{
7670  predicate(vector_element_basic_type(n) == T_BYTE);
7671  match(Set dst (VectorLoadShuffle dst));
7672  format %{ "vector_load_shuffle $dst, $dst" %}
7673  ins_encode %{
7674    // empty
7675  %}
7676  ins_pipe( pipe_slow );
7677%}
7678
7679instruct rearrangeB(vec dst, vec shuffle) %{
7680  predicate(vector_element_basic_type(n) == T_BYTE &&
7681            vector_length(n) < 32);
7682  match(Set dst (VectorRearrange dst shuffle));
7683  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7684  ins_encode %{
7685    assert(UseSSE >= 4, "required");
7686    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7687  %}
7688  ins_pipe( pipe_slow );
7689%}
7690
7691instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7692  predicate(vector_element_basic_type(n) == T_BYTE &&
7693            vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
7694  match(Set dst (VectorRearrange src shuffle));
7695  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7696  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7697  ins_encode %{
7698    assert(UseAVX >= 2, "required");
7699    // Swap src into vtmp1
7700    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
7701    // Shuffle swapped src to get entries from other 128 bit lane
7702    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7703    // Shuffle original src to get entries from self 128 bit lane
7704    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7705    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
7706    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7707    // Perform the blend
7708    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7709  %}
7710  ins_pipe( pipe_slow );
7711%}
7712
7713instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
7714  predicate(vector_element_basic_type(n) == T_BYTE &&
7715            vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
7716  match(Set dst (VectorRearrange src shuffle));
7717  format %{ "vector_rearrange $dst, $shuffle, $src" %}
7718  ins_encode %{
7719    int vlen_enc = vector_length_encoding(this);
7720    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7721  %}
7722  ins_pipe( pipe_slow );
7723%}
7724
7725// LoadShuffle/Rearrange for Short
7726
7727instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
7728  predicate(vector_element_basic_type(n) == T_SHORT &&
7729            vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
7730  match(Set dst (VectorLoadShuffle src));
7731  effect(TEMP dst, TEMP vtmp, TEMP scratch);
7732  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7733  ins_encode %{
7734    // Create a byte shuffle mask from short shuffle mask
7735    // only byte shuffle instruction available on these platforms
7736    int vlen_in_bytes = vector_length_in_bytes(this);
7737    if (UseAVX == 0) {
7738      assert(vlen_in_bytes <= 16, "required");
7739      // Multiply each shuffle by two to get byte index
7740      __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
7741      __ psllw($vtmp$$XMMRegister, 1);
7742
7743      // Duplicate to create 2 copies of byte index
7744      __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7745      __ psllw($dst$$XMMRegister, 8);
7746      __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
7747
7748      // Add one to get alternate byte index
7749      __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
7750      __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7751    } else {
7752      assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
7753      int vlen_enc = vector_length_encoding(this);
7754      // Multiply each shuffle by two to get byte index
7755      __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7756      __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7757
7758      // Duplicate to create 2 copies of byte index
7759      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
7760      __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7761
7762      // Add one to get alternate byte index
7763      __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
7764    }
7765  %}
7766  ins_pipe( pipe_slow );
7767%}
7768
7769instruct rearrangeS(vec dst, vec shuffle) %{
7770  predicate(vector_element_basic_type(n) == T_SHORT &&
7771            vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
7772  match(Set dst (VectorRearrange dst shuffle));
7773  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7774  ins_encode %{
7775    assert(UseSSE >= 4, "required");
7776    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7777  %}
7778  ins_pipe( pipe_slow );
7779%}
7780
7781instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7782  predicate(vector_element_basic_type(n) == T_SHORT &&
7783            vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7784  match(Set dst (VectorRearrange src shuffle));
7785  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7786  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7787  ins_encode %{
7788    assert(UseAVX >= 2, "required");
7789    // Swap src into vtmp1
7790    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
7791    // Shuffle swapped src to get entries from other 128 bit lane
7792    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7793    // Shuffle original src to get entries from self 128 bit lane
7794    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7795    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
7796    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7797    // Perform the blend
7798    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7799  %}
7800  ins_pipe( pipe_slow );
7801%}
7802
7803instruct loadShuffleS_evex(vec dst, vec src) %{
7804  predicate(vector_element_basic_type(n) == T_SHORT &&
7805            VM_Version::supports_avx512bw());
7806  match(Set dst (VectorLoadShuffle src));
7807  format %{ "vector_load_shuffle $dst, $src" %}
7808  ins_encode %{
7809    int vlen_enc = vector_length_encoding(this);
7810    if (!VM_Version::supports_avx512vl()) {
7811      vlen_enc = Assembler::AVX_512bit;
7812    }
7813    __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7814  %}
7815  ins_pipe( pipe_slow );
7816%}
7817
7818instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
7819  predicate(vector_element_basic_type(n) == T_SHORT &&
7820            VM_Version::supports_avx512bw());
7821  match(Set dst (VectorRearrange src shuffle));
7822  format %{ "vector_rearrange $dst, $shuffle, $src" %}
7823  ins_encode %{
7824    int vlen_enc = vector_length_encoding(this);
7825    if (!VM_Version::supports_avx512vl()) {
7826      vlen_enc = Assembler::AVX_512bit;
7827    }
7828    __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7829  %}
7830  ins_pipe( pipe_slow );
7831%}
7832
7833// LoadShuffle/Rearrange for Integer and Float
7834
7835instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
7836  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
7837            vector_length(n) == 4 && UseAVX < 2);
7838  match(Set dst (VectorLoadShuffle src));
7839  effect(TEMP dst, TEMP vtmp, TEMP scratch);
7840  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7841  ins_encode %{
7842    assert(UseSSE >= 4, "required");
7843
7844    // Create a byte shuffle mask from int shuffle mask
7845    // only byte shuffle instruction available on these platforms
7846
7847    // Duplicate and multiply each shuffle by 4
7848    __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
7849    __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7850    __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7851    __ psllw($vtmp$$XMMRegister, 2);
7852
7853    // Duplicate again to create 4 copies of byte index
7854    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7855    __ psllw($dst$$XMMRegister, 8);
7856    __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
7857
7858    // Add 3,2,1,0 to get alternate byte index
7859    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
7860    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7861  %}
7862  ins_pipe( pipe_slow );
7863%}
7864
7865instruct rearrangeI(vec dst, vec shuffle) %{
7866 predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
7867           vector_length(n) == 4 && UseAVX < 2);
7868  match(Set dst (VectorRearrange dst shuffle));
7869  format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7870  ins_encode %{
7871    assert(UseSSE >= 4, "required");
7872    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7873  %}
7874  ins_pipe( pipe_slow );
7875%}
7876
7877instruct loadShuffleI_avx(vec dst, vec src) %{
7878  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
7879            UseAVX >= 2);
7880  match(Set dst (VectorLoadShuffle src));
7881  format %{ "vector_load_shuffle $dst, $src" %}
7882  ins_encode %{
7883  int vlen_enc = vector_length_encoding(this);
7884    __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7885  %}
7886  ins_pipe( pipe_slow );
7887%}
7888
7889instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
7890  predicate((vector_element_basic_type(n) == T_INT || vector_element_basic_type(n) == T_FLOAT) &&
7891            UseAVX >= 2);
7892  match(Set dst (VectorRearrange src shuffle));
7893  format %{ "vector_rearrange $dst, $shuffle, $src" %}
7894  ins_encode %{
7895    int vlen_enc = vector_length_encoding(this);
7896    if (vlen_enc == Assembler::AVX_128bit) {
7897      vlen_enc = Assembler::AVX_256bit;
7898    }
7899    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7900  %}
7901  ins_pipe( pipe_slow );
7902%}
7903
7904// LoadShuffle/Rearrange for Long and Double
7905
7906instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
7907  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7908            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7909  match(Set dst (VectorLoadShuffle src));
7910  effect(TEMP dst, TEMP vtmp, TEMP scratch);
7911  format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7912  ins_encode %{
7913    assert(UseAVX >= 2, "required");
7914
7915    int vlen_enc = vector_length_encoding(this);
7916    // Create a double word shuffle mask from long shuffle mask
7917    // only double word shuffle instruction available on these platforms
7918
7919    // Multiply each shuffle by two to get double word index
7920    __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7921    __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7922
7923    // Duplicate each double word shuffle
7924    __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
7925    __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7926
7927    // Add one to get alternate double word index
7928    __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
7929  %}
7930  ins_pipe( pipe_slow );
7931%}
7932
7933instruct rearrangeL(vec dst, vec src, vec shuffle) %{
7934  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7935            vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7936  match(Set dst (VectorRearrange src shuffle));
7937  format %{ "vector_rearrange $dst, $shuffle, $src" %}
7938  ins_encode %{
7939    assert(UseAVX >= 2, "required");
7940
7941    int vlen_enc = vector_length_encoding(this);
7942    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7943  %}
7944  ins_pipe( pipe_slow );
7945%}
7946
7947instruct loadShuffleL_evex(vec dst, vec src) %{
7948  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7949            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7950  match(Set dst (VectorLoadShuffle src));
7951  format %{ "vector_load_shuffle $dst, $src" %}
7952  ins_encode %{
7953    assert(UseAVX > 2, "required");
7954
7955    int vlen_enc = vector_length_encoding(this);
7956    __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7957  %}
7958  ins_pipe( pipe_slow );
7959%}
7960
7961instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
7962  predicate(is_double_word_type(vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7963            (vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7964  match(Set dst (VectorRearrange src shuffle));
7965  format %{ "vector_rearrange $dst, $shuffle, $src" %}
7966  ins_encode %{
7967    assert(UseAVX > 2, "required");
7968
7969    int vlen_enc = vector_length_encoding(this);
7970    if (vlen_enc == Assembler::AVX_128bit) {
7971      vlen_enc = Assembler::AVX_256bit;
7972    }
7973    __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7974  %}
7975  ins_pipe( pipe_slow );
7976%}
7977
7978// --------------------------------- FMA --------------------------------------
7979// a * b + c
7980
7981instruct vfmaF_reg(vec a, vec b, vec c) %{
7982  match(Set c (FmaVF  c (Binary a b)));
7983  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
7984  ins_cost(150);
7985  ins_encode %{
7986    assert(UseFMA, "not enabled");
7987    int vlen_enc = vector_length_encoding(this);
7988    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
7989  %}
7990  ins_pipe( pipe_slow );
7991%}
7992
7993instruct vfmaF_mem(vec a, memory b, vec c) %{
7994  predicate(vector_length_in_bytes(n->in(1)) > 8);
7995  match(Set c (FmaVF  c (Binary a (LoadVector b))));
7996  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
7997  ins_cost(150);
7998  ins_encode %{
7999    assert(UseFMA, "not enabled");
8000    int vlen_enc = vector_length_encoding(this);
8001    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8002  %}
8003  ins_pipe( pipe_slow );
8004%}
8005
8006instruct vfmaD_reg(vec a, vec b, vec c) %{
8007  match(Set c (FmaVD  c (Binary a b)));
8008  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8009  ins_cost(150);
8010  ins_encode %{
8011    assert(UseFMA, "not enabled");
8012    int vlen_enc = vector_length_encoding(this);
8013    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8014  %}
8015  ins_pipe( pipe_slow );
8016%}
8017
8018instruct vfmaD_mem(vec a, memory b, vec c) %{
8019  predicate(vector_length_in_bytes(n->in(1)) > 8);
8020  match(Set c (FmaVD  c (Binary a (LoadVector b))));
8021  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8022  ins_cost(150);
8023  ins_encode %{
8024    assert(UseFMA, "not enabled");
8025    int vlen_enc = vector_length_encoding(this);
8026    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8027  %}
8028  ins_pipe( pipe_slow );
8029%}
8030
8031// --------------------------------- Vector Multiply Add --------------------------------------
8032
8033instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8034  predicate(UseAVX == 0);
8035  match(Set dst (MulAddVS2VI dst src1));
8036  format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8037  ins_encode %{
8038    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8039  %}
8040  ins_pipe( pipe_slow );
8041%}
8042
8043instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8044  predicate(UseAVX > 0);
8045  match(Set dst (MulAddVS2VI src1 src2));
8046  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8047  ins_encode %{
8048    int vlen_enc = vector_length_encoding(this);
8049    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8050  %}
8051  ins_pipe( pipe_slow );
8052%}
8053
8054// --------------------------------- Vector Multiply Add Add ----------------------------------
8055
8056instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8057  predicate(VM_Version::supports_avx512_vnni());
8058  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8059  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8060  ins_encode %{
8061    assert(UseAVX > 2, "required");
8062    int vlen_enc = vector_length_encoding(this);
8063    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8064  %}
8065  ins_pipe( pipe_slow );
8066  ins_cost(10);
8067%}
8068
8069// --------------------------------- PopCount --------------------------------------
8070
8071instruct vpopcountI(vec dst, vec src) %{
8072  match(Set dst (PopCountVI src));
8073  format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8074  ins_encode %{
8075    assert(UsePopCountInstruction, "not enabled");
8076
8077    int vlen_enc = vector_length_encoding(this);
8078    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8079  %}
8080  ins_pipe( pipe_slow );
8081%}
8082
8083// --------------------------------- Bitwise Ternary Logic ----------------------------------
8084
8085instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8086  match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8087  effect(TEMP dst);
8088  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8089  ins_encode %{
8090    int vector_len = vector_length_encoding(this);
8091    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8092  %}
8093  ins_pipe( pipe_slow );
8094%}
8095
8096instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8097  predicate(vector_length_in_bytes(n->in(1)->in(1)) > 8);
8098  match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8099  effect(TEMP dst);
8100  format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8101  ins_encode %{
8102    int vector_len = vector_length_encoding(this);
8103    __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8104  %}
8105  ins_pipe( pipe_slow );
8106%}
8107
8108// --------------------------------- Rotation Operations ----------------------------------
8109instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8110  match(Set dst (RotateLeftV src shift));
8111  match(Set dst (RotateRightV src shift));
8112  format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8113  ins_encode %{
8114    int opcode      = this->ideal_Opcode();
8115    int vector_len  = vector_length_encoding(this);
8116    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8117    __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8118  %}
8119  ins_pipe( pipe_slow );
8120%}
8121
8122instruct vprorate(vec dst, vec src, vec shift) %{
8123  match(Set dst (RotateLeftV src shift));
8124  match(Set dst (RotateRightV src shift));
8125  format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8126  ins_encode %{
8127    int opcode      = this->ideal_Opcode();
8128    int vector_len  = vector_length_encoding(this);
8129    BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8130    __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8131  %}
8132  ins_pipe( pipe_slow );
8133%}
8134
8135#ifdef _LP64
8136// ---------------------------------- Masked Operations ------------------------------------
8137
8138instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8139  match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8140  effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8141  format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8142  ins_encode %{
8143    assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8144    assert(vector_element_basic_type(this, $src1) == vector_element_basic_type(this, $src2), "mismatch");
8145
8146    Label DONE;
8147    int vlen_enc = vector_length_encoding(this, $src1);
8148    BasicType elem_bt = vector_element_basic_type(this, $src1);
8149
8150    __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8151    __ mov64($dst$$Register, -1L);
8152    __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8153    __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8154    __ jccb(Assembler::carrySet, DONE);
8155    __ kmovql($dst$$Register, $ktmp1$$KRegister);
8156    __ notq($dst$$Register);
8157    __ tzcntq($dst$$Register, $dst$$Register);
8158    __ bind(DONE);
8159  %}
8160  ins_pipe( pipe_slow );
8161%}
8162
8163
8164instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8165  match(Set dst (LoadVectorMasked mem mask));
8166  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8167  ins_encode %{
8168    BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8169    int vector_len = vector_length_encoding(this);
8170    __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8171  %}
8172  ins_pipe( pipe_slow );
8173%}
8174
8175instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8176  match(Set dst (VectorMaskGen len));
8177  effect(TEMP temp);
8178  format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8179  ins_encode %{
8180    __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8181  %}
8182  ins_pipe( pipe_slow );
8183%}
8184
8185instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8186  match(Set dst (VectorMaskGen len));
8187  format %{ "vector_mask_gen $len \t! vector mask generator" %}
8188  effect(TEMP temp);
8189  ins_encode %{
8190    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 -$len$$constant)));
8191    __ kmovql($dst$$KRegister, $temp$$Register);
8192  %}
8193  ins_pipe( pipe_slow );
8194%}
8195
8196instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8197  match(Set mem (StoreVectorMasked mem (Binary src mask)));
8198  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8199  ins_encode %{
8200    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8201    BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8202    int vector_len = vector_length_encoding(src_node);
8203    __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8204  %}
8205  ins_pipe( pipe_slow );
8206%}
8207
8208instruct vmask_truecount_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp) %{
8209  predicate(VM_Version::supports_avx512vlbw());
8210  match(Set dst (VectorMaskTrueCount mask));
8211  effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp);
8212  format %{ "vector_truecount_evex $mask \t! vector mask true count" %}
8213  ins_encode %{
8214    int opcode = this->ideal_Opcode();
8215    int vlen_enc = vector_length_encoding(this, $mask);
8216    int mask_len = vector_length(this, $mask);
8217    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8218                             $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8219  %}
8220  ins_pipe( pipe_slow );
8221%}
8222
8223instruct vmask_first_or_last_true_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp, rFlagsReg cr) %{
8224  predicate(VM_Version::supports_avx512vlbw());
8225  match(Set dst (VectorMaskFirstTrue mask));
8226  match(Set dst (VectorMaskLastTrue mask));
8227  effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp, KILL cr);
8228  format %{ "vector_mask_first_or_last_true_evex $mask \t! vector first/last true location" %}
8229  ins_encode %{
8230    int opcode = this->ideal_Opcode();
8231    int vlen_enc = vector_length_encoding(this, $mask);
8232    int mask_len = vector_length(this, $mask);
8233    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8234                             $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8235  %}
8236  ins_pipe( pipe_slow );
8237%}
8238
8239instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1) %{
8240  predicate(!VM_Version::supports_avx512vlbw());
8241  match(Set dst (VectorMaskTrueCount mask));
8242  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1);
8243  format %{ "vector_truecount_avx $mask \t! vector mask true count" %}
8244  ins_encode %{
8245    int opcode = this->ideal_Opcode();
8246    int vlen_enc = vector_length_encoding(this, $mask);
8247    int mask_len = vector_length(this, $mask);
8248    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8249                             $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8250  %}
8251  ins_pipe( pipe_slow );
8252%}
8253
8254instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8255  predicate(!VM_Version::supports_avx512vlbw());
8256  match(Set dst (VectorMaskFirstTrue mask));
8257  match(Set dst (VectorMaskLastTrue mask));
8258  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8259  format %{ "vector_mask_first_or_last_true_avx $mask \t! vector first/last true location" %}
8260  ins_encode %{
8261    int opcode = this->ideal_Opcode();
8262    int vlen_enc = vector_length_encoding(this, $mask);
8263    int mask_len = vector_length(this, $mask);
8264    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8265                             $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8266  %}
8267  ins_pipe( pipe_slow );
8268%}
8269#endif // _LP64
8270
8271instruct castVV(vec dst)
8272%{
8273  match(Set dst (CastVV dst));
8274
8275  size(0);
8276  format %{ "# castVV of $dst" %}
8277  ins_encode(/* empty encoding */);
8278  ins_cost(0);
8279  ins_pipe(empty);
8280%}
8281
8282instruct castVVLeg(legVec dst)
8283%{
8284  match(Set dst (CastVV dst));
8285
8286  size(0);
8287  format %{ "# castVV of $dst" %}
8288  ins_encode(/* empty encoding */);
8289  ins_cost(0);
8290  ins_pipe(empty);
8291%}
8292