1//
2// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
3// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4//
5// This code is free software; you can redistribute it and/or modify it
6// under the terms of the GNU General Public License version 2 only, as
7// published by the Free Software Foundation.
8//
9// This code is distributed in the hope that it will be useful, but WITHOUT
10// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12// version 2 for more details (a copy is included in the LICENSE file that
13// accompanied this code).
14//
15// You should have received a copy of the GNU General Public License version
16// 2 along with this work; if not, write to the Free Software Foundation,
17// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18//
19// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20// or visit www.oracle.com if you need additional information or have any
21// questions.
22//
23//
24
25// X86 Common Architecture Description File
26
27//----------REGISTER DEFINITION BLOCK------------------------------------------
28// This information is used by the matcher and the register allocator to
29// describe individual registers and classes of registers within the target
30// archtecture.
31
32register %{
33//----------Architecture Description Register Definitions----------------------
34// General Registers
35// "reg_def"  name ( register save type, C convention save type,
36//                   ideal register type, encoding );
37// Register Save Types:
38//
39// NS  = No-Save:       The register allocator assumes that these registers
40//                      can be used without saving upon entry to the method, &
41//                      that they do not need to be saved at call sites.
42//
43// SOC = Save-On-Call:  The register allocator assumes that these registers
44//                      can be used without saving upon entry to the method,
45//                      but that they must be saved at call sites.
46//
47// SOE = Save-On-Entry: The register allocator assumes that these registers
48//                      must be saved before using them upon entry to the
49//                      method, but they do not need to be saved at call
50//                      sites.
51//
52// AS  = Always-Save:   The register allocator assumes that these registers
53//                      must be saved before using them upon entry to the
54//                      method, & that they must be saved at call sites.
55//
56// Ideal Register Type is used to determine how to save & restore a
57// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
58// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
59//
60// The encoding number is the actual bit-pattern placed into the opcodes.
61
62// XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
63// Word a in each register holds a Float, words ab hold a Double.
64// The whole registers are used in SSE4.2 version intrinsics,
65// array copy stubs and superword operations (see UseSSE42Intrinsics,
66// UseXMMForArrayCopy and UseSuperword flags).
67// For pre EVEX enabled architectures:
68//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
69// For EVEX enabled architectures:
70//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
71//
72// Linux ABI:   No register preserved across function calls
73//              XMM0-XMM7 might hold parameters
74// Windows ABI: XMM6-XMM31 preserved across function calls
75//              XMM0-XMM3 might hold parameters
76
77reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
78reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
79reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
80reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
81reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
82reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
83reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
84reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
85reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
86reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
87reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
88reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
89reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
90reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
91reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
92reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
93
94reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
95reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
96reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
97reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
98reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
99reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
100reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
101reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
102reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
103reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
104reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
105reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
106reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
107reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
108reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
109reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
110
111reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
112reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
113reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
114reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
115reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
116reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
117reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
118reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
119reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
120reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
121reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
122reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
123reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
124reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
125reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
126reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
127
128reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
129reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
130reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
131reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
132reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
133reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
134reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
135reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
136reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
137reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
138reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
139reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
140reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
141reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
142reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
143reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
144
145reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
146reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
147reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
148reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
149reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
150reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
151reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
152reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
153reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
154reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
155reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
156reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
157reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
158reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
159reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
160reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
161
162reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
163reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
164reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
165reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
166reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
167reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
168reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
169reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
170reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
171reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
172reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
173reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
174reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
175reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
176reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
177reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
178
179reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
180reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
181reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
182reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
183reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
184reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
185reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
186reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
187reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
188reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
189reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
190reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
191reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
192reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
193reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
194reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
195
196reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
197reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
198reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
199reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
200reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
201reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
202reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
203reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
204reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
205reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
206reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
207reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
208reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
209reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
210reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
211reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
212
213#ifdef _LP64
214
215reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
216reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
217reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
218reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
219reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
220reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
221reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
222reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
223reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
224reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
225reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
226reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
227reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
228reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
229reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
230reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
231
232reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
233reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
234reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
235reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
236reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
237reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
238reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
239reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
240reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
241reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
242reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
243reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
244reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
245reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
246reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
247reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
248
249reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
250reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
251reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
252reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
253reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
254reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
255reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
256reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
257reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
258reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
259reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
260reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
261reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
262reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
263reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
264reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
265
266reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
267reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
268reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
269reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
270reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
271reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
272reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
273reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
274reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
275reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
276reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
277reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
278reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
279reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
280reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
281reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
282
283reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
284reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
285reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
286reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
287reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
288reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
289reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
290reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
291reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
292reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
293reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
294reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
295reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
296reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
297reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
298reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
299
300reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
301reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
302reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
303reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
304reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
305reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
306reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
307reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
308reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
309reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
310reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
311reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
312reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
313reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
314reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
315reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
316
317reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
318reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
319reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
320reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
321reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
322reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
323reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
324reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
325reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
326reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
327reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
328reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
329reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
330reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
331reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
332reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
333
334reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
335reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
336reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
337reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
338reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
339reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
340reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
341reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
342reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
343reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
344reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
345reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
346reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
347reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
348reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
349reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
350
351reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
352reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
353reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
354reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
355reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
356reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
357reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
358reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
359reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
360reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
361reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
362reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
363reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
364reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
365reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
366reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
367
368reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
369reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
370reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
371reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
372reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
373reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
374reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
375reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
376reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
377reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
378reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
379reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
380reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
381reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
382reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
383reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
384
385reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
386reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
387reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
388reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
389reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
390reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
391reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
392reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
393reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
394reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
395reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
396reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
397reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
398reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
399reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
400reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
401
402reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
403reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
404reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
405reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
406reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
407reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
408reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
409reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
410reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
411reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
412reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
413reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
414reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
415reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
416reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
417reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
418
419reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
420reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
421reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
422reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
423reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
424reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
425reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
426reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
427reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
428reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
429reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
430reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
431reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
432reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
433reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
434reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
435
436reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
437reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
438reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
439reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
440reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
441reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
442reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
443reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
444reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
445reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
446reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
447reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
448reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
449reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
450reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
451reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
452
453reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
454reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
455reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
456reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
457reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
458reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
459reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
460reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
461reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
462reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
463reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
464reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
465reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
466reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
467reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
468reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
469
470reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
471reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
472reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
473reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
474reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
475reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
476reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
477reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
478reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
479reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
480reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
481reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
482reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
483reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
484reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
485reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
486
487reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
488reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
489reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
490reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
491reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
492reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
493reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
494reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
495reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
496reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
497reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
498reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
499reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
500reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
501reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
502reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
503
504reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
505reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
506reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
507reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
508reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
509reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
510reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
511reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
512reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
513reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
514reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
515reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
516reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
517reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
518reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
519reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
520
521reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
522reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
523reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
524reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
525reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
526reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
527reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
528reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
529reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
530reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
531reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
532reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
533reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
534reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
535reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
536reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
537
538reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
539reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
540reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
541reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
542reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
543reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
544reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
545reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
546reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
547reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
548reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
549reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
550reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
551reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
552reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
553reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
554
555reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
556reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
557reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
558reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
559reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
560reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
561reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
562reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
563reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
564reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
565reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
566reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
567reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
568reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
569reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
570reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
571
572reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
573reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
574reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
575reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
576reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
577reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
578reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
579reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
580reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
581reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
582reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
583reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
584reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
585reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
586reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
587reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
588
589reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
590reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
591reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
592reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
593reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
594reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
595reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
596reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
597reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
598reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
599reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
600reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
601reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
602reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
603reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
604reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
605
606reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
607reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
608reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
609reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
610reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
611reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
612reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
613reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
614reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
615reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
616reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
617reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
618reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
619reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
620reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
621reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
622
623#endif // _LP64
624
625#ifdef _LP64
626reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
627#else
628reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
629#endif // _LP64
630
631alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
632                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
633                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
634                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
635                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
636                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
637                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
638                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
639#ifdef _LP64
640                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
641                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
642                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
643                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
644                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
645                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
646                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
647                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
648                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
649                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
650                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
651                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
652                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
653                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
654                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
655                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
656                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
657                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
658                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
659                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
660                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
661                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
662                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
663                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
664#endif
665                      );
666
667// flags allocation class should be last.
668alloc_class chunk2(RFLAGS);
669
670// Singleton class for condition codes
671reg_class int_flags(RFLAGS);
672
673// Class for pre evex float registers
674reg_class float_reg_legacy(XMM0,
675                    XMM1,
676                    XMM2,
677                    XMM3,
678                    XMM4,
679                    XMM5,
680                    XMM6,
681                    XMM7
682#ifdef _LP64
683                   ,XMM8,
684                    XMM9,
685                    XMM10,
686                    XMM11,
687                    XMM12,
688                    XMM13,
689                    XMM14,
690                    XMM15
691#endif
692                    );
693
694// Class for evex float registers
695reg_class float_reg_evex(XMM0,
696                    XMM1,
697                    XMM2,
698                    XMM3,
699                    XMM4,
700                    XMM5,
701                    XMM6,
702                    XMM7
703#ifdef _LP64
704                   ,XMM8,
705                    XMM9,
706                    XMM10,
707                    XMM11,
708                    XMM12,
709                    XMM13,
710                    XMM14,
711                    XMM15,
712                    XMM16,
713                    XMM17,
714                    XMM18,
715                    XMM19,
716                    XMM20,
717                    XMM21,
718                    XMM22,
719                    XMM23,
720                    XMM24,
721                    XMM25,
722                    XMM26,
723                    XMM27,
724                    XMM28,
725                    XMM29,
726                    XMM30,
727                    XMM31
728#endif
729                    );
730
731reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
732reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
733
734// Class for pre evex double registers
735reg_class double_reg_legacy(XMM0,  XMM0b,
736                     XMM1,  XMM1b,
737                     XMM2,  XMM2b,
738                     XMM3,  XMM3b,
739                     XMM4,  XMM4b,
740                     XMM5,  XMM5b,
741                     XMM6,  XMM6b,
742                     XMM7,  XMM7b
743#ifdef _LP64
744                    ,XMM8,  XMM8b,
745                     XMM9,  XMM9b,
746                     XMM10, XMM10b,
747                     XMM11, XMM11b,
748                     XMM12, XMM12b,
749                     XMM13, XMM13b,
750                     XMM14, XMM14b,
751                     XMM15, XMM15b
752#endif
753                     );
754
755// Class for evex double registers
756reg_class double_reg_evex(XMM0,  XMM0b,
757                     XMM1,  XMM1b,
758                     XMM2,  XMM2b,
759                     XMM3,  XMM3b,
760                     XMM4,  XMM4b,
761                     XMM5,  XMM5b,
762                     XMM6,  XMM6b,
763                     XMM7,  XMM7b
764#ifdef _LP64
765                    ,XMM8,  XMM8b,
766                     XMM9,  XMM9b,
767                     XMM10, XMM10b,
768                     XMM11, XMM11b,
769                     XMM12, XMM12b,
770                     XMM13, XMM13b,
771                     XMM14, XMM14b,
772                     XMM15, XMM15b,
773                     XMM16, XMM16b,
774                     XMM17, XMM17b,
775                     XMM18, XMM18b,
776                     XMM19, XMM19b,
777                     XMM20, XMM20b,
778                     XMM21, XMM21b,
779                     XMM22, XMM22b,
780                     XMM23, XMM23b,
781                     XMM24, XMM24b,
782                     XMM25, XMM25b,
783                     XMM26, XMM26b,
784                     XMM27, XMM27b,
785                     XMM28, XMM28b,
786                     XMM29, XMM29b,
787                     XMM30, XMM30b,
788                     XMM31, XMM31b
789#endif
790                     );
791
792reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
793reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
794
795// Class for pre evex 32bit vector registers
796reg_class vectors_reg_legacy(XMM0,
797                      XMM1,
798                      XMM2,
799                      XMM3,
800                      XMM4,
801                      XMM5,
802                      XMM6,
803                      XMM7
804#ifdef _LP64
805                     ,XMM8,
806                      XMM9,
807                      XMM10,
808                      XMM11,
809                      XMM12,
810                      XMM13,
811                      XMM14,
812                      XMM15
813#endif
814                      );
815
816// Class for evex 32bit vector registers
817reg_class vectors_reg_evex(XMM0,
818                      XMM1,
819                      XMM2,
820                      XMM3,
821                      XMM4,
822                      XMM5,
823                      XMM6,
824                      XMM7
825#ifdef _LP64
826                     ,XMM8,
827                      XMM9,
828                      XMM10,
829                      XMM11,
830                      XMM12,
831                      XMM13,
832                      XMM14,
833                      XMM15,
834                      XMM16,
835                      XMM17,
836                      XMM18,
837                      XMM19,
838                      XMM20,
839                      XMM21,
840                      XMM22,
841                      XMM23,
842                      XMM24,
843                      XMM25,
844                      XMM26,
845                      XMM27,
846                      XMM28,
847                      XMM29,
848                      XMM30,
849                      XMM31
850#endif
851                      );
852
853reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
854reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
855
856// Class for all 64bit vector registers
857reg_class vectord_reg_legacy(XMM0,  XMM0b,
858                      XMM1,  XMM1b,
859                      XMM2,  XMM2b,
860                      XMM3,  XMM3b,
861                      XMM4,  XMM4b,
862                      XMM5,  XMM5b,
863                      XMM6,  XMM6b,
864                      XMM7,  XMM7b
865#ifdef _LP64
866                     ,XMM8,  XMM8b,
867                      XMM9,  XMM9b,
868                      XMM10, XMM10b,
869                      XMM11, XMM11b,
870                      XMM12, XMM12b,
871                      XMM13, XMM13b,
872                      XMM14, XMM14b,
873                      XMM15, XMM15b
874#endif
875                      );
876
877// Class for all 64bit vector registers
878reg_class vectord_reg_evex(XMM0,  XMM0b,
879                      XMM1,  XMM1b,
880                      XMM2,  XMM2b,
881                      XMM3,  XMM3b,
882                      XMM4,  XMM4b,
883                      XMM5,  XMM5b,
884                      XMM6,  XMM6b,
885                      XMM7,  XMM7b
886#ifdef _LP64
887                     ,XMM8,  XMM8b,
888                      XMM9,  XMM9b,
889                      XMM10, XMM10b,
890                      XMM11, XMM11b,
891                      XMM12, XMM12b,
892                      XMM13, XMM13b,
893                      XMM14, XMM14b,
894                      XMM15, XMM15b,
895                      XMM16, XMM16b,
896                      XMM17, XMM17b,
897                      XMM18, XMM18b,
898                      XMM19, XMM19b,
899                      XMM20, XMM20b,
900                      XMM21, XMM21b,
901                      XMM22, XMM22b,
902                      XMM23, XMM23b,
903                      XMM24, XMM24b,
904                      XMM25, XMM25b,
905                      XMM26, XMM26b,
906                      XMM27, XMM27b,
907                      XMM28, XMM28b,
908                      XMM29, XMM29b,
909                      XMM30, XMM30b,
910                      XMM31, XMM31b
911#endif
912                      );
913
914reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
915reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
916
917// Class for all 128bit vector registers
918reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
919                      XMM1,  XMM1b,  XMM1c,  XMM1d,
920                      XMM2,  XMM2b,  XMM2c,  XMM2d,
921                      XMM3,  XMM3b,  XMM3c,  XMM3d,
922                      XMM4,  XMM4b,  XMM4c,  XMM4d,
923                      XMM5,  XMM5b,  XMM5c,  XMM5d,
924                      XMM6,  XMM6b,  XMM6c,  XMM6d,
925                      XMM7,  XMM7b,  XMM7c,  XMM7d
926#ifdef _LP64
927                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
928                      XMM9,  XMM9b,  XMM9c,  XMM9d,
929                      XMM10, XMM10b, XMM10c, XMM10d,
930                      XMM11, XMM11b, XMM11c, XMM11d,
931                      XMM12, XMM12b, XMM12c, XMM12d,
932                      XMM13, XMM13b, XMM13c, XMM13d,
933                      XMM14, XMM14b, XMM14c, XMM14d,
934                      XMM15, XMM15b, XMM15c, XMM15d
935#endif
936                      );
937
938// Class for all 128bit vector registers
939reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
940                      XMM1,  XMM1b,  XMM1c,  XMM1d,
941                      XMM2,  XMM2b,  XMM2c,  XMM2d,
942                      XMM3,  XMM3b,  XMM3c,  XMM3d,
943                      XMM4,  XMM4b,  XMM4c,  XMM4d,
944                      XMM5,  XMM5b,  XMM5c,  XMM5d,
945                      XMM6,  XMM6b,  XMM6c,  XMM6d,
946                      XMM7,  XMM7b,  XMM7c,  XMM7d
947#ifdef _LP64
948                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
949                      XMM9,  XMM9b,  XMM9c,  XMM9d,
950                      XMM10, XMM10b, XMM10c, XMM10d,
951                      XMM11, XMM11b, XMM11c, XMM11d,
952                      XMM12, XMM12b, XMM12c, XMM12d,
953                      XMM13, XMM13b, XMM13c, XMM13d,
954                      XMM14, XMM14b, XMM14c, XMM14d,
955                      XMM15, XMM15b, XMM15c, XMM15d,
956                      XMM16, XMM16b, XMM16c, XMM16d,
957                      XMM17, XMM17b, XMM17c, XMM17d,
958                      XMM18, XMM18b, XMM18c, XMM18d,
959                      XMM19, XMM19b, XMM19c, XMM19d,
960                      XMM20, XMM20b, XMM20c, XMM20d,
961                      XMM21, XMM21b, XMM21c, XMM21d,
962                      XMM22, XMM22b, XMM22c, XMM22d,
963                      XMM23, XMM23b, XMM23c, XMM23d,
964                      XMM24, XMM24b, XMM24c, XMM24d,
965                      XMM25, XMM25b, XMM25c, XMM25d,
966                      XMM26, XMM26b, XMM26c, XMM26d,
967                      XMM27, XMM27b, XMM27c, XMM27d,
968                      XMM28, XMM28b, XMM28c, XMM28d,
969                      XMM29, XMM29b, XMM29c, XMM29d,
970                      XMM30, XMM30b, XMM30c, XMM30d,
971                      XMM31, XMM31b, XMM31c, XMM31d
972#endif
973                      );
974
975reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
976reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
977
978// Class for all 256bit vector registers
979reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
980                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
981                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
982                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
983                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
984                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
985                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
986                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
987#ifdef _LP64
988                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
989                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
990                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
991                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
992                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
993                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
994                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
995                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
996#endif
997                      );
998
999// Class for all 256bit vector registers
1000reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008#ifdef _LP64
1009                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033#endif
1034                      );
1035
1036reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038
1039// Class for all 512bit vector registers
1040reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048#ifdef _LP64
1049                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                     ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073#endif
1074                      );
1075
1076// Class for restricted 512bit vector registers
1077reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085#ifdef _LP64
1086                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094#endif
1095                      );
1096
1097reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099
1100%}
1101
1102
1103//----------SOURCE BLOCK-------------------------------------------------------
1104// This is a block of C++ code which provides values, functions, and
1105// definitions necessary in the rest of the architecture description
1106
1107source_hpp %{
1108// Header information of the source block.
1109// Method declarations/definitions which are used outside
1110// the ad-scope can conveniently be defined here.
1111//
1112// To keep related declarations/definitions/uses close together,
1113// we switch between source %{ }% and source_hpp %{ }% freely as needed.
1114
1115class NativeJump;
1116
1117class CallStubImpl {
1118
1119  //--------------------------------------------------------------
1120  //---<  Used for optimization in Compile::shorten_branches  >---
1121  //--------------------------------------------------------------
1122
1123 public:
1124  // Size of call trampoline stub.
1125  static uint size_call_trampoline() {
1126    return 0; // no call trampolines on this platform
1127  }
1128
1129  // number of relocations needed by a call trampoline stub
1130  static uint reloc_call_trampoline() {
1131    return 0; // no call trampolines on this platform
1132  }
1133};
1134
1135class HandlerImpl {
1136
1137 public:
1138
1139  static int emit_exception_handler(CodeBuffer &cbuf);
1140  static int emit_deopt_handler(CodeBuffer& cbuf);
1141
1142  static uint size_exception_handler() {
1143    // NativeCall instruction size is the same as NativeJump.
1144    // exception handler starts out as jump and can be patched to
1145    // a call be deoptimization.  (4932387)
1146    // Note that this value is also credited (in output.cpp) to
1147    // the size of the code section.
1148    return NativeJump::instruction_size;
1149  }
1150
1151#ifdef _LP64
1152  static uint size_deopt_handler() {
1153    // three 5 byte instructions plus one move for unreachable address.
1154    return 15+3;
1155  }
1156#else
1157  static uint size_deopt_handler() {
1158    // NativeCall instruction size is the same as NativeJump.
1159    // exception handler starts out as jump and can be patched to
1160    // a call be deoptimization.  (4932387)
1161    // Note that this value is also credited (in output.cpp) to
1162    // the size of the code section.
1163    return 5 + NativeJump::instruction_size; // pushl(); jmp;
1164  }
1165#endif
1166};
1167
1168%} // end source_hpp
1169
1170source %{
1171
1172#include "opto/addnode.hpp"
1173
1174// Emit exception handler code.
1175// Stuff framesize into a register and call a VM stub routine.
1176int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1177
1178  // Note that the code buffer's insts_mark is always relative to insts.
1179  // That's why we must use the macroassembler to generate a handler.
1180  MacroAssembler _masm(&cbuf);
1181  address base = __ start_a_stub(size_exception_handler());
1182  if (base == NULL) {
1183    ciEnv::current()->record_failure("CodeCache is full");
1184    return 0;  // CodeBuffer::expand failed
1185  }
1186  int offset = __ offset();
1187  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1188  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1189  __ end_a_stub();
1190  return offset;
1191}
1192
1193// Emit deopt handler code.
1194int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1195
1196  // Note that the code buffer's insts_mark is always relative to insts.
1197  // That's why we must use the macroassembler to generate a handler.
1198  MacroAssembler _masm(&cbuf);
1199  address base = __ start_a_stub(size_deopt_handler());
1200  if (base == NULL) {
1201    ciEnv::current()->record_failure("CodeCache is full");
1202    return 0;  // CodeBuffer::expand failed
1203  }
1204  int offset = __ offset();
1205
1206#ifdef _LP64
1207  address the_pc = (address) __ pc();
1208  Label next;
1209  // push a "the_pc" on the stack without destroying any registers
1210  // as they all may be live.
1211
1212  // push address of "next"
1213  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1214  __ bind(next);
1215  // adjust it so it matches "the_pc"
1216  __ subptr(Address(rsp, 0), __ offset() - offset);
1217#else
1218  InternalAddress here(__ pc());
1219  __ pushptr(here.addr());
1220#endif
1221
1222  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1223  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1224  __ end_a_stub();
1225  return offset;
1226}
1227
1228
1229//=============================================================================
1230
1231  // Float masks come from different places depending on platform.
1232#ifdef _LP64
1233  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1234  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1235  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1236  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1237#else
1238  static address float_signmask()  { return (address)float_signmask_pool; }
1239  static address float_signflip()  { return (address)float_signflip_pool; }
1240  static address double_signmask() { return (address)double_signmask_pool; }
1241  static address double_signflip() { return (address)double_signflip_pool; }
1242#endif
1243  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1244  static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1245  static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1246
1247//=============================================================================
1248const bool Matcher::match_rule_supported(int opcode) {
1249  if (!has_match_rule(opcode)) {
1250    return false; // no match rule present
1251  }
1252  switch (opcode) {
1253    case Op_AbsVL:
1254      if (UseAVX < 3) {
1255        return false;
1256      }
1257      break;
1258    case Op_PopCountI:
1259    case Op_PopCountL:
1260      if (!UsePopCountInstruction) {
1261        return false;
1262      }
1263      break;
1264    case Op_PopCountVI:
1265      if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq()) {
1266        return false;
1267      }
1268      break;
1269    case Op_MulVI:
1270      if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1271        return false;
1272      }
1273      break;
1274    case Op_MulVL:
1275    case Op_MulReductionVL:
1276      if (VM_Version::supports_avx512dq() == false) {
1277        return false;
1278      }
1279      break;
1280    case Op_AddReductionVL:
1281      if (UseAVX < 3) { // only EVEX : vector connectivity becomes an issue here
1282        return false;
1283      }
1284      break;
1285    case Op_AbsVB:
1286    case Op_AbsVS:
1287    case Op_AbsVI:
1288    case Op_AddReductionVI:
1289      if (UseSSE < 3 || !VM_Version::supports_ssse3()) { // requires at least SSSE3
1290        return false;
1291      }
1292      break;
1293    case Op_MulReductionVI:
1294      if (UseSSE < 4) { // requires at least SSE4
1295        return false;
1296      }
1297      break;
1298    case Op_AddReductionVF:
1299    case Op_AddReductionVD:
1300    case Op_MulReductionVF:
1301    case Op_MulReductionVD:
1302      if (UseSSE < 1) { // requires at least SSE
1303        return false;
1304      }
1305      break;
1306    case Op_SqrtVD:
1307    case Op_SqrtVF:
1308      if (UseAVX < 1) { // enabled for AVX only
1309        return false;
1310      }
1311      break;
1312    case Op_CompareAndSwapL:
1313#ifdef _LP64
1314    case Op_CompareAndSwapP:
1315#endif
1316      if (!VM_Version::supports_cx8()) {
1317        return false;
1318      }
1319      break;
1320    case Op_CMoveVF:
1321    case Op_CMoveVD:
1322      if (UseAVX < 1 || UseAVX > 2) {
1323        return false;
1324      }
1325      break;
1326    case Op_StrIndexOf:
1327      if (!UseSSE42Intrinsics) {
1328        return false;
1329      }
1330      break;
1331    case Op_StrIndexOfChar:
1332      if (!UseSSE42Intrinsics) {
1333        return false;
1334      }
1335      break;
1336    case Op_OnSpinWait:
1337      if (VM_Version::supports_on_spin_wait() == false) {
1338        return false;
1339      }
1340      break;
1341    case Op_MulAddVS2VI:
1342    case Op_RShiftVL:
1343    case Op_AbsVD:
1344    case Op_NegVD:
1345      if (UseSSE < 2) {
1346        return false;
1347      }
1348      break;
1349    case Op_MulVB:
1350    case Op_LShiftVB:
1351    case Op_RShiftVB:
1352    case Op_URShiftVB:
1353      if (UseSSE < 4) {
1354        return false;
1355      }
1356      break;
1357#ifdef _LP64
1358    case Op_MaxD:
1359    case Op_MaxF:
1360    case Op_MinD:
1361    case Op_MinF:
1362      if (UseAVX < 1) { // enabled for AVX only
1363        return false;
1364      }
1365      break;
1366#endif
1367    case Op_CacheWB:
1368    case Op_CacheWBPreSync:
1369    case Op_CacheWBPostSync:
1370      if (!VM_Version::supports_data_cache_line_flush()) {
1371        return false;
1372      }
1373      break;
1374    case Op_RoundDoubleMode:
1375      if (UseSSE < 4) {
1376        return false;
1377      }
1378      break;
1379    case Op_RoundDoubleModeV:
1380      if (VM_Version::supports_avx() == false) {
1381        return false; // 128bit vroundpd is not available
1382      }
1383      break;
1384  }
1385  return true;  // Match rules are supported by default.
1386}
1387
1388//------------------------------------------------------------------------
1389
1390// Identify extra cases that we might want to provide match rules for vector nodes and
1391// other intrinsics guarded with vector length (vlen) and element type (bt).
1392const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1393  if (!match_rule_supported(opcode)) {
1394    return false;
1395  }
1396  // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1397  //   * SSE2 supports 128bit vectors for all types;
1398  //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1399  //   * AVX2 supports 256bit vectors for all types;
1400  //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1401  //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1402  // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1403  // And MaxVectorSize is taken into account as well.
1404  if (!vector_size_supported(bt, vlen)) {
1405    return false;
1406  }
1407  // Special cases which require vector length follow:
1408  //   * implementation limitations
1409  //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1410  //   * 128bit vroundpd instruction is present only in AVX1
1411  switch (opcode) {
1412    case Op_AbsVF:
1413    case Op_NegVF:
1414      if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1415        return false; // 512bit vandps and vxorps are not available
1416      }
1417      break;
1418    case Op_AbsVD:
1419    case Op_NegVD:
1420      if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1421        return false; // 512bit vandpd and vxorpd are not available
1422      }
1423      break;
1424    case Op_CMoveVF:
1425      if (vlen != 8) {
1426        return false; // implementation limitation (only vcmov8F_reg is present)
1427      }
1428      break;
1429    case Op_CMoveVD:
1430      if (vlen != 4) {
1431        return false; // implementation limitation (only vcmov4D_reg is present)
1432      }
1433      break;
1434  }
1435  return true;  // Per default match rules are supported.
1436}
1437
1438// x86 supports generic vector operands: vec and legVec.
1439const bool Matcher::supports_generic_vector_operands = true;
1440
1441MachOper* Matcher::specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1442  assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1443  bool legacy = (generic_opnd->opcode() == LEGVEC);
1444  if (!VM_Version::supports_avx512vlbwdq() && // KNL
1445      is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1446    // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1447    return new legVecZOper();
1448  }
1449  if (legacy) {
1450    switch (ideal_reg) {
1451      case Op_VecS: return new legVecSOper();
1452      case Op_VecD: return new legVecDOper();
1453      case Op_VecX: return new legVecXOper();
1454      case Op_VecY: return new legVecYOper();
1455      case Op_VecZ: return new legVecZOper();
1456    }
1457  } else {
1458    switch (ideal_reg) {
1459      case Op_VecS: return new vecSOper();
1460      case Op_VecD: return new vecDOper();
1461      case Op_VecX: return new vecXOper();
1462      case Op_VecY: return new vecYOper();
1463      case Op_VecZ: return new vecZOper();
1464    }
1465  }
1466  ShouldNotReachHere();
1467  return NULL;
1468}
1469
1470bool Matcher::is_generic_reg2reg_move(MachNode* m) {
1471  switch (m->rule()) {
1472    case MoveVec2Leg_rule:
1473    case MoveLeg2Vec_rule:
1474      return true;
1475    default:
1476      return false;
1477  }
1478}
1479
1480bool Matcher::is_generic_vector(MachOper* opnd) {
1481  switch (opnd->opcode()) {
1482    case VEC:
1483    case LEGVEC:
1484      return true;
1485    default:
1486      return false;
1487  }
1488}
1489
1490//------------------------------------------------------------------------
1491
1492const bool Matcher::has_predicated_vectors(void) {
1493  bool ret_value = false;
1494  if (UseAVX > 2) {
1495    ret_value = VM_Version::supports_avx512vl();
1496  }
1497
1498  return ret_value;
1499}
1500
1501const int Matcher::float_pressure(int default_pressure_threshold) {
1502  int float_pressure_threshold = default_pressure_threshold;
1503#ifdef _LP64
1504  if (UseAVX > 2) {
1505    // Increase pressure threshold on machines with AVX3 which have
1506    // 2x more XMM registers.
1507    float_pressure_threshold = default_pressure_threshold * 2;
1508  }
1509#endif
1510  return float_pressure_threshold;
1511}
1512
1513// Max vector size in bytes. 0 if not supported.
1514const int Matcher::vector_width_in_bytes(BasicType bt) {
1515  assert(is_java_primitive(bt), "only primitive type vectors");
1516  if (UseSSE < 2) return 0;
1517  // SSE2 supports 128bit vectors for all types.
1518  // AVX2 supports 256bit vectors for all types.
1519  // AVX2/EVEX supports 512bit vectors for all types.
1520  int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1521  // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1522  if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1523    size = (UseAVX > 2) ? 64 : 32;
1524  if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1525    size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1526  // Use flag to limit vector size.
1527  size = MIN2(size,(int)MaxVectorSize);
1528  // Minimum 2 values in vector (or 4 for bytes).
1529  switch (bt) {
1530  case T_DOUBLE:
1531  case T_LONG:
1532    if (size < 16) return 0;
1533    break;
1534  case T_FLOAT:
1535  case T_INT:
1536    if (size < 8) return 0;
1537    break;
1538  case T_BOOLEAN:
1539    if (size < 4) return 0;
1540    break;
1541  case T_CHAR:
1542    if (size < 4) return 0;
1543    break;
1544  case T_BYTE:
1545    if (size < 4) return 0;
1546    break;
1547  case T_SHORT:
1548    if (size < 4) return 0;
1549    break;
1550  default:
1551    ShouldNotReachHere();
1552  }
1553  return size;
1554}
1555
1556// Limits on vector size (number of elements) loaded into vector.
1557const int Matcher::max_vector_size(const BasicType bt) {
1558  return vector_width_in_bytes(bt)/type2aelembytes(bt);
1559}
1560const int Matcher::min_vector_size(const BasicType bt) {
1561  int max_size = max_vector_size(bt);
1562  // Min size which can be loaded into vector is 4 bytes.
1563  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1564  return MIN2(size,max_size);
1565}
1566
1567// Vector ideal reg corresponding to specified size in bytes
1568const uint Matcher::vector_ideal_reg(int size) {
1569  assert(MaxVectorSize >= size, "");
1570  switch(size) {
1571    case  4: return Op_VecS;
1572    case  8: return Op_VecD;
1573    case 16: return Op_VecX;
1574    case 32: return Op_VecY;
1575    case 64: return Op_VecZ;
1576  }
1577  ShouldNotReachHere();
1578  return 0;
1579}
1580
1581// Only lowest bits of xmm reg are used for vector shift count.
1582const uint Matcher::vector_shift_count_ideal_reg(int size) {
1583  return Op_VecS;
1584}
1585
1586// x86 supports misaligned vectors store/load.
1587const bool Matcher::misaligned_vectors_ok() {
1588  return true;
1589}
1590
1591// x86 AES instructions are compatible with SunJCE expanded
1592// keys, hence we do not need to pass the original key to stubs
1593const bool Matcher::pass_original_key_for_aes() {
1594  return false;
1595}
1596
1597
1598const bool Matcher::convi2l_type_required = true;
1599
1600// Check for shift by small constant as well
1601static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1602  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1603      shift->in(2)->get_int() <= 3 &&
1604      // Are there other uses besides address expressions?
1605      !matcher->is_visited(shift)) {
1606    address_visited.set(shift->_idx); // Flag as address_visited
1607    mstack.push(shift->in(2), Matcher::Visit);
1608    Node *conv = shift->in(1);
1609#ifdef _LP64
1610    // Allow Matcher to match the rule which bypass
1611    // ConvI2L operation for an array index on LP64
1612    // if the index value is positive.
1613    if (conv->Opcode() == Op_ConvI2L &&
1614        conv->as_Type()->type()->is_long()->_lo >= 0 &&
1615        // Are there other uses besides address expressions?
1616        !matcher->is_visited(conv)) {
1617      address_visited.set(conv->_idx); // Flag as address_visited
1618      mstack.push(conv->in(1), Matcher::Pre_Visit);
1619    } else
1620#endif
1621      mstack.push(conv, Matcher::Pre_Visit);
1622    return true;
1623  }
1624  return false;
1625}
1626
1627// Should the Matcher clone shifts on addressing modes, expecting them
1628// to be subsumed into complex addressing expressions or compute them
1629// into registers?
1630bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1631  Node *off = m->in(AddPNode::Offset);
1632  if (off->is_Con()) {
1633    address_visited.test_set(m->_idx); // Flag as address_visited
1634    Node *adr = m->in(AddPNode::Address);
1635
1636    // Intel can handle 2 adds in addressing mode
1637    // AtomicAdd is not an addressing expression.
1638    // Cheap to find it by looking for screwy base.
1639    if (adr->is_AddP() &&
1640        !adr->in(AddPNode::Base)->is_top() &&
1641        LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1642        // Are there other uses besides address expressions?
1643        !is_visited(adr)) {
1644      address_visited.set(adr->_idx); // Flag as address_visited
1645      Node *shift = adr->in(AddPNode::Offset);
1646      if (!clone_shift(shift, this, mstack, address_visited)) {
1647        mstack.push(shift, Pre_Visit);
1648      }
1649      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1650      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1651    } else {
1652      mstack.push(adr, Pre_Visit);
1653    }
1654
1655    // Clone X+offset as it also folds into most addressing expressions
1656    mstack.push(off, Visit);
1657    mstack.push(m->in(AddPNode::Base), Pre_Visit);
1658    return true;
1659  } else if (clone_shift(off, this, mstack, address_visited)) {
1660    address_visited.test_set(m->_idx); // Flag as address_visited
1661    mstack.push(m->in(AddPNode::Address), Pre_Visit);
1662    mstack.push(m->in(AddPNode::Base), Pre_Visit);
1663    return true;
1664  }
1665  return false;
1666}
1667
1668void Compile::reshape_address(AddPNode* addp) {
1669}
1670
1671static inline uint vector_length(const MachNode* n) {
1672  const TypeVect* vt = n->bottom_type()->is_vect();
1673  return vt->length();
1674}
1675
1676static inline uint vector_length_in_bytes(const MachNode* n) {
1677  const TypeVect* vt = n->bottom_type()->is_vect();
1678  return vt->length_in_bytes();
1679}
1680
1681static inline uint vector_length_in_bytes(const MachNode* use, MachOper* opnd) {
1682  uint def_idx = use->operand_index(opnd);
1683  Node* def = use->in(def_idx);
1684  return def->bottom_type()->is_vect()->length_in_bytes();
1685}
1686
1687static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* n) {
1688  switch(vector_length_in_bytes(n)) {
1689    case  4: // fall-through
1690    case  8: // fall-through
1691    case 16: return Assembler::AVX_128bit;
1692    case 32: return Assembler::AVX_256bit;
1693    case 64: return Assembler::AVX_512bit;
1694
1695    default: {
1696      ShouldNotReachHere();
1697      return Assembler::AVX_NoVec;
1698    }
1699  }
1700}
1701
1702// Helper methods for MachSpillCopyNode::implementation().
1703static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1704                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
1705  // In 64-bit VM size calculation is very complex. Emitting instructions
1706  // into scratch buffer is used to get size in 64-bit VM.
1707  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1708  assert(ireg == Op_VecS || // 32bit vector
1709         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1710         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1711         "no non-adjacent vector moves" );
1712  if (cbuf) {
1713    MacroAssembler _masm(cbuf);
1714    int offset = __ offset();
1715    switch (ireg) {
1716    case Op_VecS: // copy whole register
1717    case Op_VecD:
1718    case Op_VecX:
1719#ifndef _LP64
1720      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1721#else
1722      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1723        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1724      } else {
1725        __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1726     }
1727#endif
1728      break;
1729    case Op_VecY:
1730#ifndef _LP64
1731      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1732#else
1733      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1734        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1735      } else {
1736        __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1737     }
1738#endif
1739      break;
1740    case Op_VecZ:
1741      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1742      break;
1743    default:
1744      ShouldNotReachHere();
1745    }
1746    int size = __ offset() - offset;
1747#ifdef ASSERT
1748    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1749    assert(!do_size || size == 4, "incorrect size calculattion");
1750#endif
1751    return size;
1752#ifndef PRODUCT
1753  } else if (!do_size) {
1754    switch (ireg) {
1755    case Op_VecS:
1756    case Op_VecD:
1757    case Op_VecX:
1758      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1759      break;
1760    case Op_VecY:
1761    case Op_VecZ:
1762      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1763      break;
1764    default:
1765      ShouldNotReachHere();
1766    }
1767#endif
1768  }
1769  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1770  return (UseAVX > 2) ? 6 : 4;
1771}
1772
1773int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1774                     int stack_offset, int reg, uint ireg, outputStream* st) {
1775  // In 64-bit VM size calculation is very complex. Emitting instructions
1776  // into scratch buffer is used to get size in 64-bit VM.
1777  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1778  if (cbuf) {
1779    MacroAssembler _masm(cbuf);
1780    int offset = __ offset();
1781    if (is_load) {
1782      switch (ireg) {
1783      case Op_VecS:
1784        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1785        break;
1786      case Op_VecD:
1787        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1788        break;
1789      case Op_VecX:
1790#ifndef _LP64
1791        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1792#else
1793        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1794          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1795        } else {
1796          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1797          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1798        }
1799#endif
1800        break;
1801      case Op_VecY:
1802#ifndef _LP64
1803        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1804#else
1805        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1806          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1807        } else {
1808          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1809          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1810        }
1811#endif
1812        break;
1813      case Op_VecZ:
1814        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1815        break;
1816      default:
1817        ShouldNotReachHere();
1818      }
1819    } else { // store
1820      switch (ireg) {
1821      case Op_VecS:
1822        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1823        break;
1824      case Op_VecD:
1825        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1826        break;
1827      case Op_VecX:
1828#ifndef _LP64
1829        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1830#else
1831        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1832          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1833        }
1834        else {
1835          __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1836        }
1837#endif
1838        break;
1839      case Op_VecY:
1840#ifndef _LP64
1841        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1842#else
1843        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1844          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1845        }
1846        else {
1847          __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1848        }
1849#endif
1850        break;
1851      case Op_VecZ:
1852        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1853        break;
1854      default:
1855        ShouldNotReachHere();
1856      }
1857    }
1858    int size = __ offset() - offset;
1859#ifdef ASSERT
1860    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1861    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1862    assert(!do_size || size == (5+offset_size), "incorrect size calculattion");
1863#endif
1864    return size;
1865#ifndef PRODUCT
1866  } else if (!do_size) {
1867    if (is_load) {
1868      switch (ireg) {
1869      case Op_VecS:
1870        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1871        break;
1872      case Op_VecD:
1873        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1874        break;
1875       case Op_VecX:
1876        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1877        break;
1878      case Op_VecY:
1879      case Op_VecZ:
1880        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1881        break;
1882      default:
1883        ShouldNotReachHere();
1884      }
1885    } else { // store
1886      switch (ireg) {
1887      case Op_VecS:
1888        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1889        break;
1890      case Op_VecD:
1891        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1892        break;
1893       case Op_VecX:
1894        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1895        break;
1896      case Op_VecY:
1897      case Op_VecZ:
1898        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1899        break;
1900      default:
1901        ShouldNotReachHere();
1902      }
1903    }
1904#endif
1905  }
1906  bool is_single_byte = false;
1907  int vec_len = 0;
1908  if ((UseAVX > 2) && (stack_offset != 0)) {
1909    int tuple_type = Assembler::EVEX_FVM;
1910    int input_size = Assembler::EVEX_32bit;
1911    switch (ireg) {
1912    case Op_VecS:
1913      tuple_type = Assembler::EVEX_T1S;
1914      break;
1915    case Op_VecD:
1916      tuple_type = Assembler::EVEX_T1S;
1917      input_size = Assembler::EVEX_64bit;
1918      break;
1919    case Op_VecX:
1920      break;
1921    case Op_VecY:
1922      vec_len = 1;
1923      break;
1924    case Op_VecZ:
1925      vec_len = 2;
1926      break;
1927    }
1928    is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1929  }
1930  int offset_size = 0;
1931  int size = 5;
1932  if (UseAVX > 2 ) {
1933    if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1934      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1935      size += 2; // Need an additional two bytes for EVEX encoding
1936    } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1937      offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1938    } else {
1939      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1940      size += 2; // Need an additional two bytes for EVEX encodding
1941    }
1942  } else {
1943    offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1944  }
1945  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1946  return size+offset_size;
1947}
1948
1949static inline jint replicate4_imm(int con, int width) {
1950  // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1951  assert(width == 1 || width == 2, "only byte or short types here");
1952  int bit_width = width * 8;
1953  jint val = con;
1954  val &= (1 << bit_width) - 1;  // mask off sign bits
1955  while(bit_width < 32) {
1956    val |= (val << bit_width);
1957    bit_width <<= 1;
1958  }
1959  return val;
1960}
1961
1962static inline jlong replicate8_imm(int con, int width) {
1963  // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1964  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1965  int bit_width = width * 8;
1966  jlong val = con;
1967  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1968  while(bit_width < 64) {
1969    val |= (val << bit_width);
1970    bit_width <<= 1;
1971  }
1972  return val;
1973}
1974
1975#ifndef PRODUCT
1976  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1977    st->print("nop \t# %d bytes pad for loops and calls", _count);
1978  }
1979#endif
1980
1981  void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1982    MacroAssembler _masm(&cbuf);
1983    __ nop(_count);
1984  }
1985
1986  uint MachNopNode::size(PhaseRegAlloc*) const {
1987    return _count;
1988  }
1989
1990#ifndef PRODUCT
1991  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1992    st->print("# breakpoint");
1993  }
1994#endif
1995
1996  void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1997    MacroAssembler _masm(&cbuf);
1998    __ int3();
1999  }
2000
2001  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2002    return MachNode::size(ra_);
2003  }
2004
2005%}
2006
2007encode %{
2008
2009  enc_class call_epilog %{
2010    if (VerifyStackAtCalls) {
2011      // Check that stack depth is unchanged: find majik cookie on stack
2012      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2013      MacroAssembler _masm(&cbuf);
2014      Label L;
2015      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2016      __ jccb(Assembler::equal, L);
2017      // Die if stack mismatch
2018      __ int3();
2019      __ bind(L);
2020    }
2021  %}
2022
2023%}
2024
2025
2026//----------OPERANDS-----------------------------------------------------------
2027// Operand definitions must precede instruction definitions for correct parsing
2028// in the ADLC because operands constitute user defined types which are used in
2029// instruction definitions.
2030
2031// Vectors
2032
2033// Dummy generic vector class. Should be used for all vector operands.
2034// Replaced with vec[SDXYZ] during post-selection pass.
2035operand vec() %{
2036  constraint(ALLOC_IN_RC(dynamic));
2037  match(VecX);
2038  match(VecY);
2039  match(VecZ);
2040  match(VecS);
2041  match(VecD);
2042
2043  format %{ %}
2044  interface(REG_INTER);
2045%}
2046
2047// Dummy generic legacy vector class. Should be used for all legacy vector operands.
2048// Replaced with legVec[SDXYZ] during post-selection cleanup.
2049// Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2050// runtime code generation via reg_class_dynamic.
2051operand legVec() %{
2052  constraint(ALLOC_IN_RC(dynamic));
2053  match(VecX);
2054  match(VecY);
2055  match(VecZ);
2056  match(VecS);
2057  match(VecD);
2058
2059  format %{ %}
2060  interface(REG_INTER);
2061%}
2062
2063// Replaces vec during post-selection cleanup. See above.
2064operand vecS() %{
2065  constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2066  match(VecS);
2067
2068  format %{ %}
2069  interface(REG_INTER);
2070%}
2071
2072// Replaces legVec during post-selection cleanup. See above.
2073operand legVecS() %{
2074  constraint(ALLOC_IN_RC(vectors_reg_legacy));
2075  match(VecS);
2076
2077  format %{ %}
2078  interface(REG_INTER);
2079%}
2080
2081// Replaces vec during post-selection cleanup. See above.
2082operand vecD() %{
2083  constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2084  match(VecD);
2085
2086  format %{ %}
2087  interface(REG_INTER);
2088%}
2089
2090// Replaces legVec during post-selection cleanup. See above.
2091operand legVecD() %{
2092  constraint(ALLOC_IN_RC(vectord_reg_legacy));
2093  match(VecD);
2094
2095  format %{ %}
2096  interface(REG_INTER);
2097%}
2098
2099// Replaces vec during post-selection cleanup. See above.
2100operand vecX() %{
2101  constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2102  match(VecX);
2103
2104  format %{ %}
2105  interface(REG_INTER);
2106%}
2107
2108// Replaces legVec during post-selection cleanup. See above.
2109operand legVecX() %{
2110  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2111  match(VecX);
2112
2113  format %{ %}
2114  interface(REG_INTER);
2115%}
2116
2117// Replaces vec during post-selection cleanup. See above.
2118operand vecY() %{
2119  constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2120  match(VecY);
2121
2122  format %{ %}
2123  interface(REG_INTER);
2124%}
2125
2126// Replaces legVec during post-selection cleanup. See above.
2127operand legVecY() %{
2128  constraint(ALLOC_IN_RC(vectory_reg_legacy));
2129  match(VecY);
2130
2131  format %{ %}
2132  interface(REG_INTER);
2133%}
2134
2135// Replaces vec during post-selection cleanup. See above.
2136operand vecZ() %{
2137  constraint(ALLOC_IN_RC(vectorz_reg));
2138  match(VecZ);
2139
2140  format %{ %}
2141  interface(REG_INTER);
2142%}
2143
2144// Replaces legVec during post-selection cleanup. See above.
2145operand legVecZ() %{
2146  constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2147  match(VecZ);
2148
2149  format %{ %}
2150  interface(REG_INTER);
2151%}
2152
2153// Comparison Code for FP conditional move
2154operand cmpOp_vcmppd() %{
2155  match(Bool);
2156
2157  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2158            n->as_Bool()->_test._test != BoolTest::no_overflow);
2159  format %{ "" %}
2160  interface(COND_INTER) %{
2161    equal        (0x0, "eq");
2162    less         (0x1, "lt");
2163    less_equal   (0x2, "le");
2164    not_equal    (0xC, "ne");
2165    greater_equal(0xD, "ge");
2166    greater      (0xE, "gt");
2167    //TODO cannot compile (adlc breaks) without two next lines with error:
2168    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2169    // equal' for overflow.
2170    overflow     (0x20, "o");  // not really supported by the instruction
2171    no_overflow  (0x21, "no"); // not really supported by the instruction
2172  %}
2173%}
2174
2175
2176// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2177
2178// ============================================================================
2179
2180instruct ShouldNotReachHere() %{
2181  match(Halt);
2182  format %{ "ud2\t# ShouldNotReachHere" %}
2183  ins_encode %{
2184    __ stop(_halt_reason);
2185  %}
2186  ins_pipe(pipe_slow);
2187%}
2188
2189// =================================EVEX special===============================
2190
2191instruct setMask(rRegI dst, rRegI src) %{
2192  predicate(Matcher::has_predicated_vectors());
2193  match(Set dst (SetVectMaskI  src));
2194  effect(TEMP dst);
2195  format %{ "setvectmask   $dst, $src" %}
2196  ins_encode %{
2197    __ setvectmask($dst$$Register, $src$$Register);
2198  %}
2199  ins_pipe(pipe_slow);
2200%}
2201
2202// ============================================================================
2203
2204instruct addF_reg(regF dst, regF src) %{
2205  predicate((UseSSE>=1) && (UseAVX == 0));
2206  match(Set dst (AddF dst src));
2207
2208  format %{ "addss   $dst, $src" %}
2209  ins_cost(150);
2210  ins_encode %{
2211    __ addss($dst$$XMMRegister, $src$$XMMRegister);
2212  %}
2213  ins_pipe(pipe_slow);
2214%}
2215
2216instruct addF_mem(regF dst, memory src) %{
2217  predicate((UseSSE>=1) && (UseAVX == 0));
2218  match(Set dst (AddF dst (LoadF src)));
2219
2220  format %{ "addss   $dst, $src" %}
2221  ins_cost(150);
2222  ins_encode %{
2223    __ addss($dst$$XMMRegister, $src$$Address);
2224  %}
2225  ins_pipe(pipe_slow);
2226%}
2227
2228instruct addF_imm(regF dst, immF con) %{
2229  predicate((UseSSE>=1) && (UseAVX == 0));
2230  match(Set dst (AddF dst con));
2231  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2232  ins_cost(150);
2233  ins_encode %{
2234    __ addss($dst$$XMMRegister, $constantaddress($con));
2235  %}
2236  ins_pipe(pipe_slow);
2237%}
2238
2239instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2240  predicate(UseAVX > 0);
2241  match(Set dst (AddF src1 src2));
2242
2243  format %{ "vaddss  $dst, $src1, $src2" %}
2244  ins_cost(150);
2245  ins_encode %{
2246    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2247  %}
2248  ins_pipe(pipe_slow);
2249%}
2250
2251instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2252  predicate(UseAVX > 0);
2253  match(Set dst (AddF src1 (LoadF src2)));
2254
2255  format %{ "vaddss  $dst, $src1, $src2" %}
2256  ins_cost(150);
2257  ins_encode %{
2258    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2259  %}
2260  ins_pipe(pipe_slow);
2261%}
2262
2263instruct addF_reg_imm(regF dst, regF src, immF con) %{
2264  predicate(UseAVX > 0);
2265  match(Set dst (AddF src con));
2266
2267  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2268  ins_cost(150);
2269  ins_encode %{
2270    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2271  %}
2272  ins_pipe(pipe_slow);
2273%}
2274
2275instruct addD_reg(regD dst, regD src) %{
2276  predicate((UseSSE>=2) && (UseAVX == 0));
2277  match(Set dst (AddD dst src));
2278
2279  format %{ "addsd   $dst, $src" %}
2280  ins_cost(150);
2281  ins_encode %{
2282    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2283  %}
2284  ins_pipe(pipe_slow);
2285%}
2286
2287instruct addD_mem(regD dst, memory src) %{
2288  predicate((UseSSE>=2) && (UseAVX == 0));
2289  match(Set dst (AddD dst (LoadD src)));
2290
2291  format %{ "addsd   $dst, $src" %}
2292  ins_cost(150);
2293  ins_encode %{
2294    __ addsd($dst$$XMMRegister, $src$$Address);
2295  %}
2296  ins_pipe(pipe_slow);
2297%}
2298
2299instruct addD_imm(regD dst, immD con) %{
2300  predicate((UseSSE>=2) && (UseAVX == 0));
2301  match(Set dst (AddD dst con));
2302  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2303  ins_cost(150);
2304  ins_encode %{
2305    __ addsd($dst$$XMMRegister, $constantaddress($con));
2306  %}
2307  ins_pipe(pipe_slow);
2308%}
2309
2310instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2311  predicate(UseAVX > 0);
2312  match(Set dst (AddD src1 src2));
2313
2314  format %{ "vaddsd  $dst, $src1, $src2" %}
2315  ins_cost(150);
2316  ins_encode %{
2317    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2318  %}
2319  ins_pipe(pipe_slow);
2320%}
2321
2322instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2323  predicate(UseAVX > 0);
2324  match(Set dst (AddD src1 (LoadD src2)));
2325
2326  format %{ "vaddsd  $dst, $src1, $src2" %}
2327  ins_cost(150);
2328  ins_encode %{
2329    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2330  %}
2331  ins_pipe(pipe_slow);
2332%}
2333
2334instruct addD_reg_imm(regD dst, regD src, immD con) %{
2335  predicate(UseAVX > 0);
2336  match(Set dst (AddD src con));
2337
2338  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2339  ins_cost(150);
2340  ins_encode %{
2341    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2342  %}
2343  ins_pipe(pipe_slow);
2344%}
2345
2346instruct subF_reg(regF dst, regF src) %{
2347  predicate((UseSSE>=1) && (UseAVX == 0));
2348  match(Set dst (SubF dst src));
2349
2350  format %{ "subss   $dst, $src" %}
2351  ins_cost(150);
2352  ins_encode %{
2353    __ subss($dst$$XMMRegister, $src$$XMMRegister);
2354  %}
2355  ins_pipe(pipe_slow);
2356%}
2357
2358instruct subF_mem(regF dst, memory src) %{
2359  predicate((UseSSE>=1) && (UseAVX == 0));
2360  match(Set dst (SubF dst (LoadF src)));
2361
2362  format %{ "subss   $dst, $src" %}
2363  ins_cost(150);
2364  ins_encode %{
2365    __ subss($dst$$XMMRegister, $src$$Address);
2366  %}
2367  ins_pipe(pipe_slow);
2368%}
2369
2370instruct subF_imm(regF dst, immF con) %{
2371  predicate((UseSSE>=1) && (UseAVX == 0));
2372  match(Set dst (SubF dst con));
2373  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2374  ins_cost(150);
2375  ins_encode %{
2376    __ subss($dst$$XMMRegister, $constantaddress($con));
2377  %}
2378  ins_pipe(pipe_slow);
2379%}
2380
2381instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2382  predicate(UseAVX > 0);
2383  match(Set dst (SubF src1 src2));
2384
2385  format %{ "vsubss  $dst, $src1, $src2" %}
2386  ins_cost(150);
2387  ins_encode %{
2388    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2389  %}
2390  ins_pipe(pipe_slow);
2391%}
2392
2393instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2394  predicate(UseAVX > 0);
2395  match(Set dst (SubF src1 (LoadF src2)));
2396
2397  format %{ "vsubss  $dst, $src1, $src2" %}
2398  ins_cost(150);
2399  ins_encode %{
2400    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2401  %}
2402  ins_pipe(pipe_slow);
2403%}
2404
2405instruct subF_reg_imm(regF dst, regF src, immF con) %{
2406  predicate(UseAVX > 0);
2407  match(Set dst (SubF src con));
2408
2409  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2410  ins_cost(150);
2411  ins_encode %{
2412    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2413  %}
2414  ins_pipe(pipe_slow);
2415%}
2416
2417instruct subD_reg(regD dst, regD src) %{
2418  predicate((UseSSE>=2) && (UseAVX == 0));
2419  match(Set dst (SubD dst src));
2420
2421  format %{ "subsd   $dst, $src" %}
2422  ins_cost(150);
2423  ins_encode %{
2424    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2425  %}
2426  ins_pipe(pipe_slow);
2427%}
2428
2429instruct subD_mem(regD dst, memory src) %{
2430  predicate((UseSSE>=2) && (UseAVX == 0));
2431  match(Set dst (SubD dst (LoadD src)));
2432
2433  format %{ "subsd   $dst, $src" %}
2434  ins_cost(150);
2435  ins_encode %{
2436    __ subsd($dst$$XMMRegister, $src$$Address);
2437  %}
2438  ins_pipe(pipe_slow);
2439%}
2440
2441instruct subD_imm(regD dst, immD con) %{
2442  predicate((UseSSE>=2) && (UseAVX == 0));
2443  match(Set dst (SubD dst con));
2444  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2445  ins_cost(150);
2446  ins_encode %{
2447    __ subsd($dst$$XMMRegister, $constantaddress($con));
2448  %}
2449  ins_pipe(pipe_slow);
2450%}
2451
2452instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2453  predicate(UseAVX > 0);
2454  match(Set dst (SubD src1 src2));
2455
2456  format %{ "vsubsd  $dst, $src1, $src2" %}
2457  ins_cost(150);
2458  ins_encode %{
2459    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2460  %}
2461  ins_pipe(pipe_slow);
2462%}
2463
2464instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2465  predicate(UseAVX > 0);
2466  match(Set dst (SubD src1 (LoadD src2)));
2467
2468  format %{ "vsubsd  $dst, $src1, $src2" %}
2469  ins_cost(150);
2470  ins_encode %{
2471    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2472  %}
2473  ins_pipe(pipe_slow);
2474%}
2475
2476instruct subD_reg_imm(regD dst, regD src, immD con) %{
2477  predicate(UseAVX > 0);
2478  match(Set dst (SubD src con));
2479
2480  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2481  ins_cost(150);
2482  ins_encode %{
2483    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2484  %}
2485  ins_pipe(pipe_slow);
2486%}
2487
2488instruct mulF_reg(regF dst, regF src) %{
2489  predicate((UseSSE>=1) && (UseAVX == 0));
2490  match(Set dst (MulF dst src));
2491
2492  format %{ "mulss   $dst, $src" %}
2493  ins_cost(150);
2494  ins_encode %{
2495    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2496  %}
2497  ins_pipe(pipe_slow);
2498%}
2499
2500instruct mulF_mem(regF dst, memory src) %{
2501  predicate((UseSSE>=1) && (UseAVX == 0));
2502  match(Set dst (MulF dst (LoadF src)));
2503
2504  format %{ "mulss   $dst, $src" %}
2505  ins_cost(150);
2506  ins_encode %{
2507    __ mulss($dst$$XMMRegister, $src$$Address);
2508  %}
2509  ins_pipe(pipe_slow);
2510%}
2511
2512instruct mulF_imm(regF dst, immF con) %{
2513  predicate((UseSSE>=1) && (UseAVX == 0));
2514  match(Set dst (MulF dst con));
2515  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2516  ins_cost(150);
2517  ins_encode %{
2518    __ mulss($dst$$XMMRegister, $constantaddress($con));
2519  %}
2520  ins_pipe(pipe_slow);
2521%}
2522
2523instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2524  predicate(UseAVX > 0);
2525  match(Set dst (MulF src1 src2));
2526
2527  format %{ "vmulss  $dst, $src1, $src2" %}
2528  ins_cost(150);
2529  ins_encode %{
2530    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2531  %}
2532  ins_pipe(pipe_slow);
2533%}
2534
2535instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2536  predicate(UseAVX > 0);
2537  match(Set dst (MulF src1 (LoadF src2)));
2538
2539  format %{ "vmulss  $dst, $src1, $src2" %}
2540  ins_cost(150);
2541  ins_encode %{
2542    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2543  %}
2544  ins_pipe(pipe_slow);
2545%}
2546
2547instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2548  predicate(UseAVX > 0);
2549  match(Set dst (MulF src con));
2550
2551  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2552  ins_cost(150);
2553  ins_encode %{
2554    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2555  %}
2556  ins_pipe(pipe_slow);
2557%}
2558
2559instruct mulD_reg(regD dst, regD src) %{
2560  predicate((UseSSE>=2) && (UseAVX == 0));
2561  match(Set dst (MulD dst src));
2562
2563  format %{ "mulsd   $dst, $src" %}
2564  ins_cost(150);
2565  ins_encode %{
2566    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2567  %}
2568  ins_pipe(pipe_slow);
2569%}
2570
2571instruct mulD_mem(regD dst, memory src) %{
2572  predicate((UseSSE>=2) && (UseAVX == 0));
2573  match(Set dst (MulD dst (LoadD src)));
2574
2575  format %{ "mulsd   $dst, $src" %}
2576  ins_cost(150);
2577  ins_encode %{
2578    __ mulsd($dst$$XMMRegister, $src$$Address);
2579  %}
2580  ins_pipe(pipe_slow);
2581%}
2582
2583instruct mulD_imm(regD dst, immD con) %{
2584  predicate((UseSSE>=2) && (UseAVX == 0));
2585  match(Set dst (MulD dst con));
2586  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2587  ins_cost(150);
2588  ins_encode %{
2589    __ mulsd($dst$$XMMRegister, $constantaddress($con));
2590  %}
2591  ins_pipe(pipe_slow);
2592%}
2593
2594instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2595  predicate(UseAVX > 0);
2596  match(Set dst (MulD src1 src2));
2597
2598  format %{ "vmulsd  $dst, $src1, $src2" %}
2599  ins_cost(150);
2600  ins_encode %{
2601    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2602  %}
2603  ins_pipe(pipe_slow);
2604%}
2605
2606instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2607  predicate(UseAVX > 0);
2608  match(Set dst (MulD src1 (LoadD src2)));
2609
2610  format %{ "vmulsd  $dst, $src1, $src2" %}
2611  ins_cost(150);
2612  ins_encode %{
2613    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2614  %}
2615  ins_pipe(pipe_slow);
2616%}
2617
2618instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2619  predicate(UseAVX > 0);
2620  match(Set dst (MulD src con));
2621
2622  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2623  ins_cost(150);
2624  ins_encode %{
2625    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2626  %}
2627  ins_pipe(pipe_slow);
2628%}
2629
2630instruct divF_reg(regF dst, regF src) %{
2631  predicate((UseSSE>=1) && (UseAVX == 0));
2632  match(Set dst (DivF dst src));
2633
2634  format %{ "divss   $dst, $src" %}
2635  ins_cost(150);
2636  ins_encode %{
2637    __ divss($dst$$XMMRegister, $src$$XMMRegister);
2638  %}
2639  ins_pipe(pipe_slow);
2640%}
2641
2642instruct divF_mem(regF dst, memory src) %{
2643  predicate((UseSSE>=1) && (UseAVX == 0));
2644  match(Set dst (DivF dst (LoadF src)));
2645
2646  format %{ "divss   $dst, $src" %}
2647  ins_cost(150);
2648  ins_encode %{
2649    __ divss($dst$$XMMRegister, $src$$Address);
2650  %}
2651  ins_pipe(pipe_slow);
2652%}
2653
2654instruct divF_imm(regF dst, immF con) %{
2655  predicate((UseSSE>=1) && (UseAVX == 0));
2656  match(Set dst (DivF dst con));
2657  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2658  ins_cost(150);
2659  ins_encode %{
2660    __ divss($dst$$XMMRegister, $constantaddress($con));
2661  %}
2662  ins_pipe(pipe_slow);
2663%}
2664
2665instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2666  predicate(UseAVX > 0);
2667  match(Set dst (DivF src1 src2));
2668
2669  format %{ "vdivss  $dst, $src1, $src2" %}
2670  ins_cost(150);
2671  ins_encode %{
2672    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2673  %}
2674  ins_pipe(pipe_slow);
2675%}
2676
2677instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2678  predicate(UseAVX > 0);
2679  match(Set dst (DivF src1 (LoadF src2)));
2680
2681  format %{ "vdivss  $dst, $src1, $src2" %}
2682  ins_cost(150);
2683  ins_encode %{
2684    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2685  %}
2686  ins_pipe(pipe_slow);
2687%}
2688
2689instruct divF_reg_imm(regF dst, regF src, immF con) %{
2690  predicate(UseAVX > 0);
2691  match(Set dst (DivF src con));
2692
2693  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2694  ins_cost(150);
2695  ins_encode %{
2696    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2697  %}
2698  ins_pipe(pipe_slow);
2699%}
2700
2701instruct divD_reg(regD dst, regD src) %{
2702  predicate((UseSSE>=2) && (UseAVX == 0));
2703  match(Set dst (DivD dst src));
2704
2705  format %{ "divsd   $dst, $src" %}
2706  ins_cost(150);
2707  ins_encode %{
2708    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2709  %}
2710  ins_pipe(pipe_slow);
2711%}
2712
2713instruct divD_mem(regD dst, memory src) %{
2714  predicate((UseSSE>=2) && (UseAVX == 0));
2715  match(Set dst (DivD dst (LoadD src)));
2716
2717  format %{ "divsd   $dst, $src" %}
2718  ins_cost(150);
2719  ins_encode %{
2720    __ divsd($dst$$XMMRegister, $src$$Address);
2721  %}
2722  ins_pipe(pipe_slow);
2723%}
2724
2725instruct divD_imm(regD dst, immD con) %{
2726  predicate((UseSSE>=2) && (UseAVX == 0));
2727  match(Set dst (DivD dst con));
2728  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2729  ins_cost(150);
2730  ins_encode %{
2731    __ divsd($dst$$XMMRegister, $constantaddress($con));
2732  %}
2733  ins_pipe(pipe_slow);
2734%}
2735
2736instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2737  predicate(UseAVX > 0);
2738  match(Set dst (DivD src1 src2));
2739
2740  format %{ "vdivsd  $dst, $src1, $src2" %}
2741  ins_cost(150);
2742  ins_encode %{
2743    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2744  %}
2745  ins_pipe(pipe_slow);
2746%}
2747
2748instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2749  predicate(UseAVX > 0);
2750  match(Set dst (DivD src1 (LoadD src2)));
2751
2752  format %{ "vdivsd  $dst, $src1, $src2" %}
2753  ins_cost(150);
2754  ins_encode %{
2755    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2756  %}
2757  ins_pipe(pipe_slow);
2758%}
2759
2760instruct divD_reg_imm(regD dst, regD src, immD con) %{
2761  predicate(UseAVX > 0);
2762  match(Set dst (DivD src con));
2763
2764  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2765  ins_cost(150);
2766  ins_encode %{
2767    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2768  %}
2769  ins_pipe(pipe_slow);
2770%}
2771
2772instruct absF_reg(regF dst) %{
2773  predicate((UseSSE>=1) && (UseAVX == 0));
2774  match(Set dst (AbsF dst));
2775  ins_cost(150);
2776  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2777  ins_encode %{
2778    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2779  %}
2780  ins_pipe(pipe_slow);
2781%}
2782
2783instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2784  predicate(UseAVX > 0);
2785  match(Set dst (AbsF src));
2786  ins_cost(150);
2787  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2788  ins_encode %{
2789    int vector_len = 0;
2790    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2791              ExternalAddress(float_signmask()), vector_len);
2792  %}
2793  ins_pipe(pipe_slow);
2794%}
2795
2796instruct absD_reg(regD dst) %{
2797  predicate((UseSSE>=2) && (UseAVX == 0));
2798  match(Set dst (AbsD dst));
2799  ins_cost(150);
2800  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2801            "# abs double by sign masking" %}
2802  ins_encode %{
2803    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2804  %}
2805  ins_pipe(pipe_slow);
2806%}
2807
2808instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2809  predicate(UseAVX > 0);
2810  match(Set dst (AbsD src));
2811  ins_cost(150);
2812  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2813            "# abs double by sign masking" %}
2814  ins_encode %{
2815    int vector_len = 0;
2816    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2817              ExternalAddress(double_signmask()), vector_len);
2818  %}
2819  ins_pipe(pipe_slow);
2820%}
2821
2822instruct negF_reg(regF dst) %{
2823  predicate((UseSSE>=1) && (UseAVX == 0));
2824  match(Set dst (NegF dst));
2825  ins_cost(150);
2826  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2827  ins_encode %{
2828    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2829  %}
2830  ins_pipe(pipe_slow);
2831%}
2832
2833instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2834  predicate(UseAVX > 0);
2835  match(Set dst (NegF src));
2836  ins_cost(150);
2837  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2838  ins_encode %{
2839    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2840                 ExternalAddress(float_signflip()));
2841  %}
2842  ins_pipe(pipe_slow);
2843%}
2844
2845instruct negD_reg(regD dst) %{
2846  predicate((UseSSE>=2) && (UseAVX == 0));
2847  match(Set dst (NegD dst));
2848  ins_cost(150);
2849  format %{ "xorpd   $dst, [0x8000000000000000]\t"
2850            "# neg double by sign flipping" %}
2851  ins_encode %{
2852    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2853  %}
2854  ins_pipe(pipe_slow);
2855%}
2856
2857instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2858  predicate(UseAVX > 0);
2859  match(Set dst (NegD src));
2860  ins_cost(150);
2861  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2862            "# neg double by sign flipping" %}
2863  ins_encode %{
2864    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2865                 ExternalAddress(double_signflip()));
2866  %}
2867  ins_pipe(pipe_slow);
2868%}
2869
2870instruct sqrtF_reg(regF dst, regF src) %{
2871  predicate(UseSSE>=1);
2872  match(Set dst (SqrtF src));
2873
2874  format %{ "sqrtss  $dst, $src" %}
2875  ins_cost(150);
2876  ins_encode %{
2877    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2878  %}
2879  ins_pipe(pipe_slow);
2880%}
2881
2882instruct sqrtF_mem(regF dst, memory src) %{
2883  predicate(UseSSE>=1);
2884  match(Set dst (SqrtF (LoadF src)));
2885
2886  format %{ "sqrtss  $dst, $src" %}
2887  ins_cost(150);
2888  ins_encode %{
2889    __ sqrtss($dst$$XMMRegister, $src$$Address);
2890  %}
2891  ins_pipe(pipe_slow);
2892%}
2893
2894instruct sqrtF_imm(regF dst, immF con) %{
2895  predicate(UseSSE>=1);
2896  match(Set dst (SqrtF con));
2897
2898  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2899  ins_cost(150);
2900  ins_encode %{
2901    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2902  %}
2903  ins_pipe(pipe_slow);
2904%}
2905
2906instruct sqrtD_reg(regD dst, regD src) %{
2907  predicate(UseSSE>=2);
2908  match(Set dst (SqrtD src));
2909
2910  format %{ "sqrtsd  $dst, $src" %}
2911  ins_cost(150);
2912  ins_encode %{
2913    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2914  %}
2915  ins_pipe(pipe_slow);
2916%}
2917
2918instruct sqrtD_mem(regD dst, memory src) %{
2919  predicate(UseSSE>=2);
2920  match(Set dst (SqrtD (LoadD src)));
2921
2922  format %{ "sqrtsd  $dst, $src" %}
2923  ins_cost(150);
2924  ins_encode %{
2925    __ sqrtsd($dst$$XMMRegister, $src$$Address);
2926  %}
2927  ins_pipe(pipe_slow);
2928%}
2929
2930instruct sqrtD_imm(regD dst, immD con) %{
2931  predicate(UseSSE>=2);
2932  match(Set dst (SqrtD con));
2933  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2934  ins_cost(150);
2935  ins_encode %{
2936    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2937  %}
2938  ins_pipe(pipe_slow);
2939%}
2940
2941
2942#ifdef _LP64
2943instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
2944  match(Set dst (RoundDoubleMode src rmode));
2945  format %{ "roundsd $dst,$src" %}
2946  ins_cost(150);
2947  ins_encode %{
2948    assert(UseSSE >= 4, "required");
2949    __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
2950  %}
2951  ins_pipe(pipe_slow);
2952%}
2953
2954instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
2955  match(Set dst (RoundDoubleMode (LoadD src) rmode));
2956  format %{ "roundsd $dst,$src" %}
2957  ins_cost(150);
2958  ins_encode %{
2959    assert(UseSSE >= 4, "required");
2960    __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
2961  %}
2962  ins_pipe(pipe_slow);
2963%}
2964
2965instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
2966  match(Set dst (RoundDoubleMode con rmode));
2967  effect(TEMP scratch_reg);
2968  format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
2969  ins_cost(150);
2970  ins_encode %{
2971    assert(UseSSE >= 4, "required");
2972    __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
2973  %}
2974  ins_pipe(pipe_slow);
2975%}
2976
2977instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
2978  predicate(n->as_Vector()->length() < 8);
2979  match(Set dst (RoundDoubleModeV src rmode));
2980  format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
2981  ins_encode %{
2982    assert(UseAVX > 0, "required");
2983    int vector_len = vector_length_encoding(this);
2984    __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vector_len);
2985  %}
2986  ins_pipe( pipe_slow );
2987%}
2988
2989instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
2990  predicate(n->as_Vector()->length() == 8);
2991  match(Set dst (RoundDoubleModeV src rmode));
2992  format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
2993  ins_encode %{
2994    assert(UseAVX > 2, "required");
2995    __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
2996  %}
2997  ins_pipe( pipe_slow );
2998%}
2999
3000instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3001  predicate(n->as_Vector()->length() < 8);
3002  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3003  format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3004  ins_encode %{
3005    assert(UseAVX > 0, "required");
3006    int vector_len = vector_length_encoding(this);
3007    __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vector_len);
3008  %}
3009  ins_pipe( pipe_slow );
3010%}
3011
3012instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3013  predicate(n->as_Vector()->length() == 8);
3014  match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3015  format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3016  ins_encode %{
3017    assert(UseAVX > 2, "required");
3018    __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3019  %}
3020  ins_pipe( pipe_slow );
3021%}
3022#endif // _LP64
3023
3024instruct onspinwait() %{
3025  match(OnSpinWait);
3026  ins_cost(200);
3027
3028  format %{
3029    $$template
3030    $$emit$$"pause\t! membar_onspinwait"
3031  %}
3032  ins_encode %{
3033    __ pause();
3034  %}
3035  ins_pipe(pipe_slow);
3036%}
3037
3038// a * b + c
3039instruct fmaD_reg(regD a, regD b, regD c) %{
3040  predicate(UseFMA);
3041  match(Set c (FmaD  c (Binary a b)));
3042  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3043  ins_cost(150);
3044  ins_encode %{
3045    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3046  %}
3047  ins_pipe( pipe_slow );
3048%}
3049
3050// a * b + c
3051instruct fmaF_reg(regF a, regF b, regF c) %{
3052  predicate(UseFMA);
3053  match(Set c (FmaF  c (Binary a b)));
3054  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3055  ins_cost(150);
3056  ins_encode %{
3057    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3058  %}
3059  ins_pipe( pipe_slow );
3060%}
3061
3062// ====================VECTOR INSTRUCTIONS=====================================
3063
3064// Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3065instruct MoveVec2Leg(legVec dst, vec src) %{
3066  match(Set dst src);
3067  format %{ "" %}
3068  ins_encode %{
3069    ShouldNotReachHere();
3070  %}
3071  ins_pipe( fpu_reg_reg );
3072%}
3073
3074instruct MoveLeg2Vec(vec dst, legVec src) %{
3075  match(Set dst src);
3076  format %{ "" %}
3077  ins_encode %{
3078    ShouldNotReachHere();
3079  %}
3080  ins_pipe( fpu_reg_reg );
3081%}
3082
3083// ============================================================================
3084
3085// Load vectors
3086instruct loadV(vec dst, memory mem) %{
3087  match(Set dst (LoadVector mem));
3088  ins_cost(125);
3089  format %{ "load_vector $dst,$mem" %}
3090  ins_encode %{
3091    switch (vector_length_in_bytes(this)) {
3092      case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3093      case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3094      case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3095      case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3096      case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3097      default: ShouldNotReachHere();
3098    }
3099  %}
3100  ins_pipe( pipe_slow );
3101%}
3102
3103// Store vectors generic operand pattern.
3104instruct storeV(memory mem, vec src) %{
3105  match(Set mem (StoreVector mem src));
3106  ins_cost(145);
3107  format %{ "store_vector $mem,$src\n\t" %}
3108  ins_encode %{
3109    switch (vector_length_in_bytes(this, $src)) {
3110      case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3111      case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3112      case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3113      case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3114      case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3115      default: ShouldNotReachHere();
3116    }
3117  %}
3118  ins_pipe( pipe_slow );
3119%}
3120
3121// ====================LEGACY REPLICATE=======================================
3122
3123instruct Repl16B(vec dst, rRegI src) %{
3124  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3125  match(Set dst (ReplicateB src));
3126  format %{ "movd    $dst,$src\n\t"
3127            "punpcklbw $dst,$dst\n\t"
3128            "pshuflw $dst,$dst,0x00\n\t"
3129            "punpcklqdq $dst,$dst\t! replicate16B" %}
3130  ins_encode %{
3131    __ movdl($dst$$XMMRegister, $src$$Register);
3132    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3133    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3134    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3135  %}
3136  ins_pipe( pipe_slow );
3137%}
3138
3139instruct Repl32B(vec dst, rRegI src) %{
3140  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3141  match(Set dst (ReplicateB src));
3142  format %{ "movd    $dst,$src\n\t"
3143            "punpcklbw $dst,$dst\n\t"
3144            "pshuflw $dst,$dst,0x00\n\t"
3145            "punpcklqdq $dst,$dst\n\t"
3146            "vinserti128_high $dst,$dst\t! replicate32B" %}
3147  ins_encode %{
3148    __ movdl($dst$$XMMRegister, $src$$Register);
3149    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3150    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3151    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3152    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3153  %}
3154  ins_pipe( pipe_slow );
3155%}
3156
3157instruct Repl64B(legVec dst, rRegI src) %{
3158  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3159  match(Set dst (ReplicateB src));
3160  format %{ "movd    $dst,$src\n\t"
3161            "punpcklbw $dst,$dst\n\t"
3162            "pshuflw $dst,$dst,0x00\n\t"
3163            "punpcklqdq $dst,$dst\n\t"
3164            "vinserti128_high $dst,$dst\t"
3165            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3166  ins_encode %{
3167    __ movdl($dst$$XMMRegister, $src$$Register);
3168    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3169    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3170    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3171    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3172    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3173  %}
3174  ins_pipe( pipe_slow );
3175%}
3176
3177instruct Repl16B_imm(vec dst, immI con) %{
3178  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3179  match(Set dst (ReplicateB con));
3180  format %{ "movq    $dst,[$constantaddress]\n\t"
3181            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3182  ins_encode %{
3183    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3184    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3185  %}
3186  ins_pipe( pipe_slow );
3187%}
3188
3189instruct Repl32B_imm(vec dst, immI con) %{
3190  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3191  match(Set dst (ReplicateB con));
3192  format %{ "movq    $dst,[$constantaddress]\n\t"
3193            "punpcklqdq $dst,$dst\n\t"
3194            "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
3195  ins_encode %{
3196    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3197    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3198    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3199  %}
3200  ins_pipe( pipe_slow );
3201%}
3202
3203instruct Repl64B_imm(legVec dst, immI con) %{
3204  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3205  match(Set dst (ReplicateB con));
3206  format %{ "movq    $dst,[$constantaddress]\n\t"
3207            "punpcklqdq $dst,$dst\n\t"
3208            "vinserti128_high $dst,$dst\t"
3209            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3210  ins_encode %{
3211    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3212    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3213    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3214    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3215  %}
3216  ins_pipe( pipe_slow );
3217%}
3218
3219instruct Repl4S(vec dst, rRegI src) %{
3220  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3221  match(Set dst (ReplicateS src));
3222  format %{ "movd    $dst,$src\n\t"
3223            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3224  ins_encode %{
3225    __ movdl($dst$$XMMRegister, $src$$Register);
3226    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3227  %}
3228  ins_pipe( pipe_slow );
3229%}
3230
3231instruct Repl4S_mem(vec dst, memory mem) %{
3232  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3233  match(Set dst (ReplicateS (LoadS mem)));
3234  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3235  ins_encode %{
3236    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3237  %}
3238  ins_pipe( pipe_slow );
3239%}
3240
3241instruct Repl8S(vec dst, rRegI src) %{
3242  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3243  match(Set dst (ReplicateS src));
3244  format %{ "movd    $dst,$src\n\t"
3245            "pshuflw $dst,$dst,0x00\n\t"
3246            "punpcklqdq $dst,$dst\t! replicate8S" %}
3247  ins_encode %{
3248    __ movdl($dst$$XMMRegister, $src$$Register);
3249    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3250    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3251  %}
3252  ins_pipe( pipe_slow );
3253%}
3254
3255instruct Repl8S_mem(vec dst, memory mem) %{
3256  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3257  match(Set dst (ReplicateS (LoadS mem)));
3258  format %{ "pshuflw $dst,$mem,0x00\n\t"
3259            "punpcklqdq $dst,$dst\t! replicate8S" %}
3260  ins_encode %{
3261    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3262    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3263  %}
3264  ins_pipe( pipe_slow );
3265%}
3266
3267instruct Repl8S_imm(vec dst, immI con) %{
3268  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3269  match(Set dst (ReplicateS con));
3270  format %{ "movq    $dst,[$constantaddress]\n\t"
3271            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3272  ins_encode %{
3273    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3274    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3275  %}
3276  ins_pipe( pipe_slow );
3277%}
3278
3279instruct Repl16S(vec dst, rRegI src) %{
3280  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3281  match(Set dst (ReplicateS src));
3282  format %{ "movd    $dst,$src\n\t"
3283            "pshuflw $dst,$dst,0x00\n\t"
3284            "punpcklqdq $dst,$dst\n\t"
3285            "vinserti128_high $dst,$dst\t! replicate16S" %}
3286  ins_encode %{
3287    __ movdl($dst$$XMMRegister, $src$$Register);
3288    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3289    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3290    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3291  %}
3292  ins_pipe( pipe_slow );
3293%}
3294
3295instruct Repl16S_mem(vec dst, memory mem) %{
3296  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3297  match(Set dst (ReplicateS (LoadS mem)));
3298  format %{ "pshuflw $dst,$mem,0x00\n\t"
3299            "punpcklqdq $dst,$dst\n\t"
3300            "vinserti128_high $dst,$dst\t! replicate16S" %}
3301  ins_encode %{
3302    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3303    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3304    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3305  %}
3306  ins_pipe( pipe_slow );
3307%}
3308
3309instruct Repl16S_imm(vec dst, immI con) %{
3310  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3311  match(Set dst (ReplicateS con));
3312  format %{ "movq    $dst,[$constantaddress]\n\t"
3313            "punpcklqdq $dst,$dst\n\t"
3314            "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3315  ins_encode %{
3316    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3317    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3318    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3319  %}
3320  ins_pipe( pipe_slow );
3321%}
3322
3323instruct Repl32S(legVec dst, rRegI src) %{
3324  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3325  match(Set dst (ReplicateS src));
3326  format %{ "movd    $dst,$src\n\t"
3327            "pshuflw $dst,$dst,0x00\n\t"
3328            "punpcklqdq $dst,$dst\n\t"
3329            "vinserti128_high $dst,$dst\t"
3330            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3331  ins_encode %{
3332    __ movdl($dst$$XMMRegister, $src$$Register);
3333    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3334    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3335    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3336    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3337  %}
3338  ins_pipe( pipe_slow );
3339%}
3340
3341instruct Repl32S_mem(legVec dst, memory mem) %{
3342  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3343  match(Set dst (ReplicateS (LoadS mem)));
3344  format %{ "pshuflw $dst,$mem,0x00\n\t"
3345            "punpcklqdq $dst,$dst\n\t"
3346            "vinserti128_high $dst,$dst\t"
3347            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3348  ins_encode %{
3349    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3350    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3351    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3352    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3353  %}
3354  ins_pipe( pipe_slow );
3355%}
3356
3357instruct Repl32S_imm(legVec dst, immI con) %{
3358  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3359  match(Set dst (ReplicateS con));
3360  format %{ "movq    $dst,[$constantaddress]\n\t"
3361            "punpcklqdq $dst,$dst\n\t"
3362            "vinserti128_high $dst,$dst\t"
3363            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3364  ins_encode %{
3365    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3366    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3367    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3368    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3369  %}
3370  ins_pipe( pipe_slow );
3371%}
3372
3373instruct Repl4I(vec dst, rRegI src) %{
3374  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3375  match(Set dst (ReplicateI src));
3376  format %{ "movd    $dst,$src\n\t"
3377            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3378  ins_encode %{
3379    __ movdl($dst$$XMMRegister, $src$$Register);
3380    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3381  %}
3382  ins_pipe( pipe_slow );
3383%}
3384
3385instruct Repl4I_mem(vec dst, memory mem) %{
3386  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3387  match(Set dst (ReplicateI (LoadI mem)));
3388  format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3389  ins_encode %{
3390    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3391  %}
3392  ins_pipe( pipe_slow );
3393%}
3394
3395instruct Repl8I(vec dst, rRegI src) %{
3396  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3397  match(Set dst (ReplicateI src));
3398  format %{ "movd    $dst,$src\n\t"
3399            "pshufd  $dst,$dst,0x00\n\t"
3400            "vinserti128_high $dst,$dst\t! replicate8I" %}
3401  ins_encode %{
3402    __ movdl($dst$$XMMRegister, $src$$Register);
3403    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3404    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3405  %}
3406  ins_pipe( pipe_slow );
3407%}
3408
3409instruct Repl8I_mem(vec dst, memory mem) %{
3410  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3411  match(Set dst (ReplicateI (LoadI mem)));
3412  format %{ "pshufd  $dst,$mem,0x00\n\t"
3413            "vinserti128_high $dst,$dst\t! replicate8I" %}
3414  ins_encode %{
3415    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3416    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3417  %}
3418  ins_pipe( pipe_slow );
3419%}
3420
3421instruct Repl16I(legVec dst, rRegI src) %{
3422  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3423  match(Set dst (ReplicateI src));
3424  format %{ "movd    $dst,$src\n\t"
3425            "pshufd  $dst,$dst,0x00\n\t"
3426            "vinserti128_high $dst,$dst\t"
3427            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3428  ins_encode %{
3429    __ movdl($dst$$XMMRegister, $src$$Register);
3430    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3431    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3432    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3433  %}
3434  ins_pipe( pipe_slow );
3435%}
3436
3437instruct Repl16I_mem(legVec dst, memory mem) %{
3438  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3439  match(Set dst (ReplicateI (LoadI mem)));
3440  format %{ "pshufd  $dst,$mem,0x00\n\t"
3441            "vinserti128_high $dst,$dst\t"
3442            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3443  ins_encode %{
3444    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3445    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3446    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3447  %}
3448  ins_pipe( pipe_slow );
3449%}
3450
3451instruct Repl4I_imm(vec dst, immI con) %{
3452  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3453  match(Set dst (ReplicateI con));
3454  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3455            "punpcklqdq $dst,$dst" %}
3456  ins_encode %{
3457    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3458    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3459  %}
3460  ins_pipe( pipe_slow );
3461%}
3462
3463instruct Repl8I_imm(vec dst, immI con) %{
3464  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3465  match(Set dst (ReplicateI con));
3466  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3467            "punpcklqdq $dst,$dst\n\t"
3468            "vinserti128_high $dst,$dst" %}
3469  ins_encode %{
3470    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3471    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3472    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3473  %}
3474  ins_pipe( pipe_slow );
3475%}
3476
3477instruct Repl16I_imm(legVec dst, immI con) %{
3478  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3479  match(Set dst (ReplicateI con));
3480  format %{ "movq    $dst,[$constantaddress]\t"
3481            "punpcklqdq $dst,$dst\n\t"
3482            "vinserti128_high $dst,$dst"
3483            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3484  ins_encode %{
3485    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3486    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3487    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3488    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3489  %}
3490  ins_pipe( pipe_slow );
3491%}
3492
3493// Long could be loaded into xmm register directly from memory.
3494instruct Repl2L_mem(vec dst, memory mem) %{
3495  predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3496  match(Set dst (ReplicateL (LoadL mem)));
3497  format %{ "movq    $dst,$mem\n\t"
3498            "punpcklqdq $dst,$dst\t! replicate2L" %}
3499  ins_encode %{
3500    __ movq($dst$$XMMRegister, $mem$$Address);
3501    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3502  %}
3503  ins_pipe( pipe_slow );
3504%}
3505
3506// Replicate long (8 byte) scalar to be vector
3507#ifdef _LP64
3508instruct Repl4L(vec dst, rRegL src) %{
3509  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3510  match(Set dst (ReplicateL src));
3511  format %{ "movdq   $dst,$src\n\t"
3512            "punpcklqdq $dst,$dst\n\t"
3513            "vinserti128_high $dst,$dst\t! replicate4L" %}
3514  ins_encode %{
3515    __ movdq($dst$$XMMRegister, $src$$Register);
3516    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3517    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3518  %}
3519  ins_pipe( pipe_slow );
3520%}
3521
3522instruct Repl8L(legVec dst, rRegL src) %{
3523  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524  match(Set dst (ReplicateL src));
3525  format %{ "movdq   $dst,$src\n\t"
3526            "punpcklqdq $dst,$dst\n\t"
3527            "vinserti128_high $dst,$dst\t"
3528            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3529  ins_encode %{
3530    __ movdq($dst$$XMMRegister, $src$$Register);
3531    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3532    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3533    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3534  %}
3535  ins_pipe( pipe_slow );
3536%}
3537#else // _LP64
3538instruct Repl4L(vec dst, eRegL src, vec tmp) %{
3539  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3540  match(Set dst (ReplicateL src));
3541  effect(TEMP dst, USE src, TEMP tmp);
3542  format %{ "movdl   $dst,$src.lo\n\t"
3543            "movdl   $tmp,$src.hi\n\t"
3544            "punpckldq $dst,$tmp\n\t"
3545            "punpcklqdq $dst,$dst\n\t"
3546            "vinserti128_high $dst,$dst\t! replicate4L" %}
3547  ins_encode %{
3548    __ movdl($dst$$XMMRegister, $src$$Register);
3549    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3550    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3551    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3552    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3553  %}
3554  ins_pipe( pipe_slow );
3555%}
3556
3557instruct Repl8L(legVec dst, eRegL src, legVec tmp) %{
3558  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3559  match(Set dst (ReplicateL src));
3560  effect(TEMP dst, USE src, TEMP tmp);
3561  format %{ "movdl   $dst,$src.lo\n\t"
3562            "movdl   $tmp,$src.hi\n\t"
3563            "punpckldq $dst,$tmp\n\t"
3564            "punpcklqdq $dst,$dst\n\t"
3565            "vinserti128_high $dst,$dst\t"
3566            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3567  ins_encode %{
3568    __ movdl($dst$$XMMRegister, $src$$Register);
3569    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3570    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3571    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3572    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3573    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3574  %}
3575  ins_pipe( pipe_slow );
3576%}
3577#endif // _LP64
3578
3579instruct Repl4L_imm(vec dst, immL con) %{
3580  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3581  match(Set dst (ReplicateL con));
3582  format %{ "movq    $dst,[$constantaddress]\n\t"
3583            "punpcklqdq $dst,$dst\n\t"
3584            "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3585  ins_encode %{
3586    __ movq($dst$$XMMRegister, $constantaddress($con));
3587    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3588    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3589  %}
3590  ins_pipe( pipe_slow );
3591%}
3592
3593instruct Repl8L_imm(legVec dst, immL con) %{
3594  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3595  match(Set dst (ReplicateL con));
3596  format %{ "movq    $dst,[$constantaddress]\n\t"
3597            "punpcklqdq $dst,$dst\n\t"
3598            "vinserti128_high $dst,$dst\t"
3599            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3600  ins_encode %{
3601    __ movq($dst$$XMMRegister, $constantaddress($con));
3602    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3603    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3604    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3605  %}
3606  ins_pipe( pipe_slow );
3607%}
3608
3609instruct Repl4L_mem(vec dst, memory mem) %{
3610  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3611  match(Set dst (ReplicateL (LoadL mem)));
3612  format %{ "movq    $dst,$mem\n\t"
3613            "punpcklqdq $dst,$dst\n\t"
3614            "vinserti128_high $dst,$dst\t! replicate4L" %}
3615  ins_encode %{
3616    __ movq($dst$$XMMRegister, $mem$$Address);
3617    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3618    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3619  %}
3620  ins_pipe( pipe_slow );
3621%}
3622
3623instruct Repl8L_mem(legVec dst, memory mem) %{
3624  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3625  match(Set dst (ReplicateL (LoadL mem)));
3626  format %{ "movq    $dst,$mem\n\t"
3627            "punpcklqdq $dst,$dst\n\t"
3628            "vinserti128_high $dst,$dst\t"
3629            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3630  ins_encode %{
3631    __ movq($dst$$XMMRegister, $mem$$Address);
3632    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3633    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3634    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3635  %}
3636  ins_pipe( pipe_slow );
3637%}
3638
3639instruct Repl2F_mem(vec dst, memory mem) %{
3640  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3641  match(Set dst (ReplicateF (LoadF mem)));
3642  format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3643  ins_encode %{
3644    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3645  %}
3646  ins_pipe( pipe_slow );
3647%}
3648
3649instruct Repl4F_mem(vec dst, memory mem) %{
3650  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3651  match(Set dst (ReplicateF (LoadF mem)));
3652  format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3653  ins_encode %{
3654    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3655  %}
3656  ins_pipe( pipe_slow );
3657%}
3658
3659instruct Repl8F(vec dst, vlRegF src) %{
3660  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3661  match(Set dst (ReplicateF src));
3662  format %{ "pshufd  $dst,$src,0x00\n\t"
3663            "vinsertf128_high $dst,$dst\t! replicate8F" %}
3664  ins_encode %{
3665    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3666    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3667  %}
3668  ins_pipe( pipe_slow );
3669%}
3670
3671instruct Repl8F_mem(vec dst, memory mem) %{
3672  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3673  match(Set dst (ReplicateF (LoadF mem)));
3674  format %{ "pshufd  $dst,$mem,0x00\n\t"
3675            "vinsertf128_high $dst,$dst\t! replicate8F" %}
3676  ins_encode %{
3677    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3678    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3679  %}
3680  ins_pipe( pipe_slow );
3681%}
3682
3683instruct Repl16F(legVec dst, vlRegF src) %{
3684  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3685  match(Set dst (ReplicateF src));
3686  format %{ "pshufd  $dst,$src,0x00\n\t"
3687            "vinsertf128_high $dst,$dst\t"
3688            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3689  ins_encode %{
3690    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3691    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3692    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3693  %}
3694  ins_pipe( pipe_slow );
3695%}
3696
3697instruct Repl16F_mem(legVec dst, memory mem) %{
3698  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3699  match(Set dst (ReplicateF (LoadF mem)));
3700  format %{ "pshufd  $dst,$mem,0x00\n\t"
3701            "vinsertf128_high $dst,$dst\t"
3702            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3703  ins_encode %{
3704    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3705    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3706    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3707  %}
3708  ins_pipe( pipe_slow );
3709%}
3710
3711instruct Repl2F_zero(vec dst, immF0 zero) %{
3712  predicate(n->as_Vector()->length() == 2);
3713  match(Set dst (ReplicateF zero));
3714  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3715  ins_encode %{
3716    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3717  %}
3718  ins_pipe( fpu_reg_reg );
3719%}
3720
3721instruct Repl4F_zero(vec dst, immF0 zero) %{
3722  predicate(n->as_Vector()->length() == 4);
3723  match(Set dst (ReplicateF zero));
3724  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3725  ins_encode %{
3726    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3727  %}
3728  ins_pipe( fpu_reg_reg );
3729%}
3730
3731instruct Repl8F_zero(vec dst, immF0 zero) %{
3732  predicate(n->as_Vector()->length() == 8 && UseAVX > 0);
3733  match(Set dst (ReplicateF zero));
3734  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3735  ins_encode %{
3736    int vector_len = 1;
3737    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3738  %}
3739  ins_pipe( fpu_reg_reg );
3740%}
3741
3742instruct Repl2D_mem(vec dst, memory mem) %{
3743  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3744  match(Set dst (ReplicateD (LoadD mem)));
3745  format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3746  ins_encode %{
3747    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3748  %}
3749  ins_pipe( pipe_slow );
3750%}
3751
3752instruct Repl4D(vec dst, vlRegD src) %{
3753  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3754  match(Set dst (ReplicateD src));
3755  format %{ "pshufd  $dst,$src,0x44\n\t"
3756            "vinsertf128_high $dst,$dst\t! replicate4D" %}
3757  ins_encode %{
3758    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3759    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3760  %}
3761  ins_pipe( pipe_slow );
3762%}
3763
3764instruct Repl4D_mem(vec dst, memory mem) %{
3765  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3766  match(Set dst (ReplicateD (LoadD mem)));
3767  format %{ "pshufd  $dst,$mem,0x44\n\t"
3768            "vinsertf128_high $dst,$dst\t! replicate4D" %}
3769  ins_encode %{
3770    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3771    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3772  %}
3773  ins_pipe( pipe_slow );
3774%}
3775
3776instruct Repl8D(legVec dst, vlRegD src) %{
3777  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3778  match(Set dst (ReplicateD src));
3779  format %{ "pshufd  $dst,$src,0x44\n\t"
3780            "vinsertf128_high $dst,$dst\t"
3781            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3782  ins_encode %{
3783    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3784    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3785    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3786  %}
3787  ins_pipe( pipe_slow );
3788%}
3789
3790instruct Repl8D_mem(legVec dst, memory mem) %{
3791  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3792  match(Set dst (ReplicateD (LoadD mem)));
3793  format %{ "pshufd  $dst,$mem,0x44\n\t"
3794            "vinsertf128_high $dst,$dst\t"
3795            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3796  ins_encode %{
3797    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3798    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3799    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3800  %}
3801  ins_pipe( pipe_slow );
3802%}
3803
3804// Replicate double (8 byte) scalar zero to be vector
3805instruct Repl2D_zero(vec dst, immD0 zero) %{
3806  predicate(n->as_Vector()->length() == 2);
3807  match(Set dst (ReplicateD zero));
3808  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3809  ins_encode %{
3810    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3811  %}
3812  ins_pipe( fpu_reg_reg );
3813%}
3814
3815instruct Repl4D_zero(vec dst, immD0 zero) %{
3816  predicate(n->as_Vector()->length() == 4 && UseAVX > 0);
3817  match(Set dst (ReplicateD zero));
3818  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3819  ins_encode %{
3820    int vector_len = 1;
3821    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3822  %}
3823  ins_pipe( fpu_reg_reg );
3824%}
3825
3826// ====================GENERIC REPLICATE==========================================
3827
3828// Replicate byte scalar to be vector
3829instruct Repl4B(vec dst, rRegI src) %{
3830  predicate(n->as_Vector()->length() == 4);
3831  match(Set dst (ReplicateB src));
3832  format %{ "movd    $dst,$src\n\t"
3833            "punpcklbw $dst,$dst\n\t"
3834            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3835  ins_encode %{
3836    __ movdl($dst$$XMMRegister, $src$$Register);
3837    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3838    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3839  %}
3840  ins_pipe( pipe_slow );
3841%}
3842
3843instruct Repl8B(vec dst, rRegI src) %{
3844  predicate(n->as_Vector()->length() == 8);
3845  match(Set dst (ReplicateB src));
3846  format %{ "movd    $dst,$src\n\t"
3847            "punpcklbw $dst,$dst\n\t"
3848            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3849  ins_encode %{
3850    __ movdl($dst$$XMMRegister, $src$$Register);
3851    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3852    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3853  %}
3854  ins_pipe( pipe_slow );
3855%}
3856
3857// Replicate byte scalar immediate to be vector by loading from const table.
3858instruct Repl4B_imm(vec dst, immI con) %{
3859  predicate(n->as_Vector()->length() == 4);
3860  match(Set dst (ReplicateB con));
3861  format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3862  ins_encode %{
3863    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3864  %}
3865  ins_pipe( pipe_slow );
3866%}
3867
3868instruct Repl8B_imm(vec dst, immI con) %{
3869  predicate(n->as_Vector()->length() == 8);
3870  match(Set dst (ReplicateB con));
3871  format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3872  ins_encode %{
3873    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3874  %}
3875  ins_pipe( pipe_slow );
3876%}
3877
3878// Replicate byte scalar zero to be vector
3879instruct Repl4B_zero(vec dst, immI0 zero) %{
3880  predicate(n->as_Vector()->length() == 4);
3881  match(Set dst (ReplicateB zero));
3882  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3883  ins_encode %{
3884    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3885  %}
3886  ins_pipe( fpu_reg_reg );
3887%}
3888
3889instruct Repl8B_zero(vec dst, immI0 zero) %{
3890  predicate(n->as_Vector()->length() == 8);
3891  match(Set dst (ReplicateB zero));
3892  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3893  ins_encode %{
3894    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3895  %}
3896  ins_pipe( fpu_reg_reg );
3897%}
3898
3899instruct Repl16B_zero(vec dst, immI0 zero) %{
3900  predicate(n->as_Vector()->length() == 16);
3901  match(Set dst (ReplicateB zero));
3902  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3903  ins_encode %{
3904    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3905  %}
3906  ins_pipe( fpu_reg_reg );
3907%}
3908
3909instruct Repl32B_zero(vec dst, immI0 zero) %{
3910  predicate(n->as_Vector()->length() == 32);
3911  match(Set dst (ReplicateB zero));
3912  format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3913  ins_encode %{
3914    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3915    int vector_len = 1;
3916    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3917  %}
3918  ins_pipe( fpu_reg_reg );
3919%}
3920
3921// Replicate char/short (2 byte) scalar to be vector
3922instruct Repl2S(vec dst, rRegI src) %{
3923  predicate(n->as_Vector()->length() == 2);
3924  match(Set dst (ReplicateS src));
3925  format %{ "movd    $dst,$src\n\t"
3926            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3927  ins_encode %{
3928    __ movdl($dst$$XMMRegister, $src$$Register);
3929    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3930  %}
3931  ins_pipe( fpu_reg_reg );
3932%}
3933
3934// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3935instruct Repl2S_imm(vec dst, immI con) %{
3936  predicate(n->as_Vector()->length() == 2);
3937  match(Set dst (ReplicateS con));
3938  format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3939  ins_encode %{
3940    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3941  %}
3942  ins_pipe( fpu_reg_reg );
3943%}
3944
3945instruct Repl4S_imm(vec dst, immI con) %{
3946  predicate(n->as_Vector()->length() == 4);
3947  match(Set dst (ReplicateS con));
3948  format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3949  ins_encode %{
3950    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3951  %}
3952  ins_pipe( fpu_reg_reg );
3953%}
3954
3955// Replicate char/short (2 byte) scalar zero to be vector
3956instruct Repl2S_zero(vec dst, immI0 zero) %{
3957  predicate(n->as_Vector()->length() == 2);
3958  match(Set dst (ReplicateS zero));
3959  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3960  ins_encode %{
3961    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3962  %}
3963  ins_pipe( fpu_reg_reg );
3964%}
3965
3966instruct Repl4S_zero(vec dst, immI0 zero) %{
3967  predicate(n->as_Vector()->length() == 4);
3968  match(Set dst (ReplicateS zero));
3969  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3970  ins_encode %{
3971    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3972  %}
3973  ins_pipe( fpu_reg_reg );
3974%}
3975
3976instruct Repl8S_zero(vec dst, immI0 zero) %{
3977  predicate(n->as_Vector()->length() == 8);
3978  match(Set dst (ReplicateS zero));
3979  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3980  ins_encode %{
3981    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3982  %}
3983  ins_pipe( fpu_reg_reg );
3984%}
3985
3986instruct Repl16S_zero(vec dst, immI0 zero) %{
3987  predicate(n->as_Vector()->length() == 16);
3988  match(Set dst (ReplicateS zero));
3989  format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3990  ins_encode %{
3991    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3992    int vector_len = 1;
3993    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3994  %}
3995  ins_pipe( fpu_reg_reg );
3996%}
3997
3998// Replicate integer (4 byte) scalar to be vector
3999instruct Repl2I(vec dst, rRegI src) %{
4000  predicate(n->as_Vector()->length() == 2);
4001  match(Set dst (ReplicateI src));
4002  format %{ "movd    $dst,$src\n\t"
4003            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4004  ins_encode %{
4005    __ movdl($dst$$XMMRegister, $src$$Register);
4006    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4007  %}
4008  ins_pipe( fpu_reg_reg );
4009%}
4010
4011// Integer could be loaded into xmm register directly from memory.
4012instruct Repl2I_mem(vec dst, memory mem) %{
4013  predicate(n->as_Vector()->length() == 2);
4014  match(Set dst (ReplicateI (LoadI mem)));
4015  format %{ "movd    $dst,$mem\n\t"
4016            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4017  ins_encode %{
4018    __ movdl($dst$$XMMRegister, $mem$$Address);
4019    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4020  %}
4021  ins_pipe( fpu_reg_reg );
4022%}
4023
4024// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4025instruct Repl2I_imm(vec dst, immI con) %{
4026  predicate(n->as_Vector()->length() == 2);
4027  match(Set dst (ReplicateI con));
4028  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4029  ins_encode %{
4030    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4031  %}
4032  ins_pipe( fpu_reg_reg );
4033%}
4034
4035// Replicate integer (4 byte) scalar zero to be vector
4036instruct Repl2I_zero(vec dst, immI0 zero) %{
4037  predicate(n->as_Vector()->length() == 2);
4038  match(Set dst (ReplicateI zero));
4039  format %{ "pxor    $dst,$dst\t! replicate2I" %}
4040  ins_encode %{
4041    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4042  %}
4043  ins_pipe( fpu_reg_reg );
4044%}
4045
4046instruct Repl4I_zero(vec dst, immI0 zero) %{
4047  predicate(n->as_Vector()->length() == 4);
4048  match(Set dst (ReplicateI zero));
4049  format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4050  ins_encode %{
4051    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4052  %}
4053  ins_pipe( fpu_reg_reg );
4054%}
4055
4056instruct Repl8I_zero(vec dst, immI0 zero) %{
4057  predicate(n->as_Vector()->length() == 8);
4058  match(Set dst (ReplicateI zero));
4059  format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4060  ins_encode %{
4061    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4062    int vector_len = 1;
4063    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4064  %}
4065  ins_pipe( fpu_reg_reg );
4066%}
4067
4068// Replicate long (8 byte) scalar to be vector
4069#ifdef _LP64
4070instruct Repl2L(vec dst, rRegL src) %{
4071  predicate(n->as_Vector()->length() == 2);
4072  match(Set dst (ReplicateL src));
4073  format %{ "movdq   $dst,$src\n\t"
4074            "punpcklqdq $dst,$dst\t! replicate2L" %}
4075  ins_encode %{
4076    __ movdq($dst$$XMMRegister, $src$$Register);
4077    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4078  %}
4079  ins_pipe( pipe_slow );
4080%}
4081#else // _LP64
4082instruct Repl2L(vec dst, eRegL src, vec tmp) %{
4083  predicate(n->as_Vector()->length() == 2);
4084  match(Set dst (ReplicateL src));
4085  effect(TEMP dst, USE src, TEMP tmp);
4086  format %{ "movdl   $dst,$src.lo\n\t"
4087            "movdl   $tmp,$src.hi\n\t"
4088            "punpckldq $dst,$tmp\n\t"
4089            "punpcklqdq $dst,$dst\t! replicate2L"%}
4090  ins_encode %{
4091    __ movdl($dst$$XMMRegister, $src$$Register);
4092    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4093    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4094    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4095  %}
4096  ins_pipe( pipe_slow );
4097%}
4098#endif // _LP64
4099
4100// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4101instruct Repl2L_imm(vec dst, immL con) %{
4102  predicate(n->as_Vector()->length() == 2);
4103  match(Set dst (ReplicateL con));
4104  format %{ "movq    $dst,[$constantaddress]\n\t"
4105            "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4106  ins_encode %{
4107    __ movq($dst$$XMMRegister, $constantaddress($con));
4108    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4109  %}
4110  ins_pipe( pipe_slow );
4111%}
4112
4113// Replicate long (8 byte) scalar zero to be vector
4114instruct Repl2L_zero(vec dst, immL0 zero) %{
4115  predicate(n->as_Vector()->length() == 2);
4116  match(Set dst (ReplicateL zero));
4117  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4118  ins_encode %{
4119    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4120  %}
4121  ins_pipe( fpu_reg_reg );
4122%}
4123
4124instruct Repl4L_zero(vec dst, immL0 zero) %{
4125  predicate(n->as_Vector()->length() == 4);
4126  match(Set dst (ReplicateL zero));
4127  format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4128  ins_encode %{
4129    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4130    int vector_len = 1;
4131    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4132  %}
4133  ins_pipe( fpu_reg_reg );
4134%}
4135
4136// Replicate float (4 byte) scalar to be vector
4137instruct Repl2F(vec dst, vlRegF src) %{
4138  predicate(n->as_Vector()->length() == 2);
4139  match(Set dst (ReplicateF src));
4140  format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4141  ins_encode %{
4142    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4143  %}
4144  ins_pipe( fpu_reg_reg );
4145%}
4146
4147instruct Repl4F(vec dst, vlRegF src) %{
4148  predicate(n->as_Vector()->length() == 4);
4149  match(Set dst (ReplicateF src));
4150  format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4151  ins_encode %{
4152    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4153  %}
4154  ins_pipe( pipe_slow );
4155%}
4156
4157// Replicate double (8 bytes) scalar to be vector
4158instruct Repl2D(vec dst, vlRegD src) %{
4159  predicate(n->as_Vector()->length() == 2);
4160  match(Set dst (ReplicateD src));
4161  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4162  ins_encode %{
4163    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4164  %}
4165  ins_pipe( pipe_slow );
4166%}
4167
4168// ====================EVEX REPLICATE=============================================
4169
4170instruct Repl4B_mem_evex(vec dst, memory mem) %{
4171  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4172  match(Set dst (ReplicateB (LoadB mem)));
4173  format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4174  ins_encode %{
4175    int vector_len = 0;
4176    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4177  %}
4178  ins_pipe( pipe_slow );
4179%}
4180
4181instruct Repl8B_mem_evex(vec dst, memory mem) %{
4182  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4183  match(Set dst (ReplicateB (LoadB mem)));
4184  format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4185  ins_encode %{
4186    int vector_len = 0;
4187    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4188  %}
4189  ins_pipe( pipe_slow );
4190%}
4191
4192instruct Repl16B_evex(vec dst, rRegI src) %{
4193  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4194  match(Set dst (ReplicateB src));
4195  format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4196  ins_encode %{
4197   int vector_len = 0;
4198    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4199  %}
4200  ins_pipe( pipe_slow );
4201%}
4202
4203instruct Repl16B_mem_evex(vec dst, memory mem) %{
4204  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4205  match(Set dst (ReplicateB (LoadB mem)));
4206  format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4207  ins_encode %{
4208    int vector_len = 0;
4209    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4210  %}
4211  ins_pipe( pipe_slow );
4212%}
4213
4214instruct Repl32B_evex(vec dst, rRegI src) %{
4215  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4216  match(Set dst (ReplicateB src));
4217  format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4218  ins_encode %{
4219   int vector_len = 1;
4220    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4221  %}
4222  ins_pipe( pipe_slow );
4223%}
4224
4225instruct Repl32B_mem_evex(vec dst, memory mem) %{
4226  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4227  match(Set dst (ReplicateB (LoadB mem)));
4228  format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4229  ins_encode %{
4230    int vector_len = 1;
4231    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4232  %}
4233  ins_pipe( pipe_slow );
4234%}
4235
4236instruct Repl64B_evex(vec dst, rRegI src) %{
4237  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4238  match(Set dst (ReplicateB src));
4239  format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4240  ins_encode %{
4241   int vector_len = 2;
4242    __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4243  %}
4244  ins_pipe( pipe_slow );
4245%}
4246
4247instruct Repl64B_mem_evex(vec dst, memory mem) %{
4248  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4249  match(Set dst (ReplicateB (LoadB mem)));
4250  format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4251  ins_encode %{
4252    int vector_len = 2;
4253    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4254  %}
4255  ins_pipe( pipe_slow );
4256%}
4257
4258instruct Repl16B_imm_evex(vec dst, immI con) %{
4259  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4260  match(Set dst (ReplicateB con));
4261  format %{ "movq    $dst,[$constantaddress]\n\t"
4262            "vpbroadcastb $dst,$dst\t! replicate16B" %}
4263  ins_encode %{
4264   int vector_len = 0;
4265    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4266    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4267  %}
4268  ins_pipe( pipe_slow );
4269%}
4270
4271instruct Repl32B_imm_evex(vec dst, immI con) %{
4272  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4273  match(Set dst (ReplicateB con));
4274  format %{ "movq    $dst,[$constantaddress]\n\t"
4275            "vpbroadcastb $dst,$dst\t! replicate32B" %}
4276  ins_encode %{
4277   int vector_len = 1;
4278    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4279    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4280  %}
4281  ins_pipe( pipe_slow );
4282%}
4283
4284instruct Repl64B_imm_evex(vec dst, immI con) %{
4285  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4286  match(Set dst (ReplicateB con));
4287  format %{ "movq    $dst,[$constantaddress]\n\t"
4288            "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4289  ins_encode %{
4290   int vector_len = 2;
4291    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4292    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4293  %}
4294  ins_pipe( pipe_slow );
4295%}
4296
4297instruct Repl64B_zero_evex(vec dst, immI0 zero) %{
4298  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4299  match(Set dst (ReplicateB zero));
4300  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4301  ins_encode %{
4302    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4303    int vector_len = 2;
4304    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4305  %}
4306  ins_pipe( fpu_reg_reg );
4307%}
4308
4309instruct Repl4S_evex(vec dst, rRegI src) %{
4310  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4311  match(Set dst (ReplicateS src));
4312  format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4313  ins_encode %{
4314   int vector_len = 0;
4315    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4316  %}
4317  ins_pipe( pipe_slow );
4318%}
4319
4320instruct Repl4S_mem_evex(vec dst, memory mem) %{
4321  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4322  match(Set dst (ReplicateS (LoadS mem)));
4323  format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4324  ins_encode %{
4325    int vector_len = 0;
4326    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4327  %}
4328  ins_pipe( pipe_slow );
4329%}
4330
4331instruct Repl8S_evex(vec dst, rRegI src) %{
4332  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4333  match(Set dst (ReplicateS src));
4334  format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4335  ins_encode %{
4336   int vector_len = 0;
4337    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4338  %}
4339  ins_pipe( pipe_slow );
4340%}
4341
4342instruct Repl8S_mem_evex(vec dst, memory mem) %{
4343  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4344  match(Set dst (ReplicateS (LoadS mem)));
4345  format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4346  ins_encode %{
4347    int vector_len = 0;
4348    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4349  %}
4350  ins_pipe( pipe_slow );
4351%}
4352
4353instruct Repl16S_evex(vec dst, rRegI src) %{
4354  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4355  match(Set dst (ReplicateS src));
4356  format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4357  ins_encode %{
4358   int vector_len = 1;
4359    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4360  %}
4361  ins_pipe( pipe_slow );
4362%}
4363
4364instruct Repl16S_mem_evex(vec dst, memory mem) %{
4365  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4366  match(Set dst (ReplicateS (LoadS mem)));
4367  format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4368  ins_encode %{
4369    int vector_len = 1;
4370    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4371  %}
4372  ins_pipe( pipe_slow );
4373%}
4374
4375instruct Repl32S_evex(vec dst, rRegI src) %{
4376  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4377  match(Set dst (ReplicateS src));
4378  format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4379  ins_encode %{
4380   int vector_len = 2;
4381    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4382  %}
4383  ins_pipe( pipe_slow );
4384%}
4385
4386instruct Repl32S_mem_evex(vec dst, memory mem) %{
4387  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4388  match(Set dst (ReplicateS (LoadS mem)));
4389  format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4390  ins_encode %{
4391    int vector_len = 2;
4392    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4393  %}
4394  ins_pipe( pipe_slow );
4395%}
4396
4397instruct Repl8S_imm_evex(vec dst, immI con) %{
4398  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4399  match(Set dst (ReplicateS con));
4400  format %{ "movq    $dst,[$constantaddress]\n\t"
4401            "vpbroadcastw $dst,$dst\t! replicate8S" %}
4402  ins_encode %{
4403   int vector_len = 0;
4404    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4405    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4406  %}
4407  ins_pipe( pipe_slow );
4408%}
4409
4410instruct Repl16S_imm_evex(vec dst, immI con) %{
4411  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4412  match(Set dst (ReplicateS con));
4413  format %{ "movq    $dst,[$constantaddress]\n\t"
4414            "vpbroadcastw $dst,$dst\t! replicate16S" %}
4415  ins_encode %{
4416   int vector_len = 1;
4417    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4418    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4419  %}
4420  ins_pipe( pipe_slow );
4421%}
4422
4423instruct Repl32S_imm_evex(vec dst, immI con) %{
4424  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4425  match(Set dst (ReplicateS con));
4426  format %{ "movq    $dst,[$constantaddress]\n\t"
4427            "vpbroadcastw $dst,$dst\t! replicate32S" %}
4428  ins_encode %{
4429   int vector_len = 2;
4430    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4431    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4432  %}
4433  ins_pipe( pipe_slow );
4434%}
4435
4436instruct Repl32S_zero_evex(vec dst, immI0 zero) %{
4437  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4438  match(Set dst (ReplicateS zero));
4439  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4440  ins_encode %{
4441    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4442    int vector_len = 2;
4443    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4444  %}
4445  ins_pipe( fpu_reg_reg );
4446%}
4447
4448instruct Repl4I_evex(vec dst, rRegI src) %{
4449  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4450  match(Set dst (ReplicateI src));
4451  format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4452  ins_encode %{
4453    int vector_len = 0;
4454    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4455  %}
4456  ins_pipe( pipe_slow );
4457%}
4458
4459instruct Repl4I_mem_evex(vec dst, memory mem) %{
4460  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4461  match(Set dst (ReplicateI (LoadI mem)));
4462  format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4463  ins_encode %{
4464    int vector_len = 0;
4465    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4466  %}
4467  ins_pipe( pipe_slow );
4468%}
4469
4470instruct Repl8I_evex(vec dst, rRegI src) %{
4471  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4472  match(Set dst (ReplicateI src));
4473  format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4474  ins_encode %{
4475    int vector_len = 1;
4476    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4477  %}
4478  ins_pipe( pipe_slow );
4479%}
4480
4481instruct Repl8I_mem_evex(vec dst, memory mem) %{
4482  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4483  match(Set dst (ReplicateI (LoadI mem)));
4484  format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4485  ins_encode %{
4486    int vector_len = 1;
4487    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4488  %}
4489  ins_pipe( pipe_slow );
4490%}
4491
4492instruct Repl16I_evex(vec dst, rRegI src) %{
4493  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4494  match(Set dst (ReplicateI src));
4495  format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4496  ins_encode %{
4497    int vector_len = 2;
4498    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4499  %}
4500  ins_pipe( pipe_slow );
4501%}
4502
4503instruct Repl16I_mem_evex(vec dst, memory mem) %{
4504  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4505  match(Set dst (ReplicateI (LoadI mem)));
4506  format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4507  ins_encode %{
4508    int vector_len = 2;
4509    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4510  %}
4511  ins_pipe( pipe_slow );
4512%}
4513
4514instruct Repl4I_imm_evex(vec dst, immI con) %{
4515  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4516  match(Set dst (ReplicateI con));
4517  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4518            "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4519  ins_encode %{
4520    int vector_len = 0;
4521    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4522    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4523  %}
4524  ins_pipe( pipe_slow );
4525%}
4526
4527instruct Repl8I_imm_evex(vec dst, immI con) %{
4528  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4529  match(Set dst (ReplicateI con));
4530  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4531            "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4532  ins_encode %{
4533    int vector_len = 1;
4534    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4535    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4536  %}
4537  ins_pipe( pipe_slow );
4538%}
4539
4540instruct Repl16I_imm_evex(vec dst, immI con) %{
4541  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4542  match(Set dst (ReplicateI con));
4543  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4544            "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4545  ins_encode %{
4546    int vector_len = 2;
4547    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4548    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4549  %}
4550  ins_pipe( pipe_slow );
4551%}
4552
4553instruct Repl16I_zero_evex(vec dst, immI0 zero) %{
4554  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4555  match(Set dst (ReplicateI zero));
4556  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4557  ins_encode %{
4558    // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
4559    int vector_len = 2;
4560    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4561  %}
4562  ins_pipe( fpu_reg_reg );
4563%}
4564
4565// Replicate long (8 byte) scalar to be vector
4566#ifdef _LP64
4567instruct Repl4L_evex(vec dst, rRegL src) %{
4568  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4569  match(Set dst (ReplicateL src));
4570  format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4571  ins_encode %{
4572    int vector_len = 1;
4573    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4574  %}
4575  ins_pipe( pipe_slow );
4576%}
4577
4578instruct Repl8L_evex(vec dst, rRegL src) %{
4579  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4580  match(Set dst (ReplicateL src));
4581  format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4582  ins_encode %{
4583    int vector_len = 2;
4584    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4585  %}
4586  ins_pipe( pipe_slow );
4587%}
4588#else // _LP64
4589instruct Repl4L_evex(vec dst, eRegL src, regD tmp) %{
4590  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4591  match(Set dst (ReplicateL src));
4592  effect(TEMP dst, USE src, TEMP tmp);
4593  format %{ "movdl   $dst,$src.lo\n\t"
4594            "movdl   $tmp,$src.hi\n\t"
4595            "punpckldq $dst,$tmp\n\t"
4596            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4597  ins_encode %{
4598    int vector_len = 1;
4599    __ movdl($dst$$XMMRegister, $src$$Register);
4600    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4601    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4602    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4603  %}
4604  ins_pipe( pipe_slow );
4605%}
4606
4607instruct Repl8L_evex(legVec dst, eRegL src, legVec tmp) %{
4608  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4609  match(Set dst (ReplicateL src));
4610  effect(TEMP dst, USE src, TEMP tmp);
4611  format %{ "movdl   $dst,$src.lo\n\t"
4612            "movdl   $tmp,$src.hi\n\t"
4613            "punpckldq $dst,$tmp\n\t"
4614            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4615  ins_encode %{
4616    int vector_len = 2;
4617    __ movdl($dst$$XMMRegister, $src$$Register);
4618    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4619    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4620    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4621  %}
4622  ins_pipe( pipe_slow );
4623%}
4624#endif // _LP64
4625
4626instruct Repl4L_imm_evex(vec dst, immL con) %{
4627  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4628  match(Set dst (ReplicateL con));
4629  format %{ "movq    $dst,[$constantaddress]\n\t"
4630            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4631  ins_encode %{
4632    int vector_len = 1;
4633    __ movq($dst$$XMMRegister, $constantaddress($con));
4634    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4635  %}
4636  ins_pipe( pipe_slow );
4637%}
4638
4639instruct Repl8L_imm_evex(vec dst, immL con) %{
4640  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4641  match(Set dst (ReplicateL con));
4642  format %{ "movq    $dst,[$constantaddress]\n\t"
4643            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4644  ins_encode %{
4645    int vector_len = 2;
4646    __ movq($dst$$XMMRegister, $constantaddress($con));
4647    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4648  %}
4649  ins_pipe( pipe_slow );
4650%}
4651
4652instruct Repl2L_mem_evex(vec dst, memory mem) %{
4653  predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4654  match(Set dst (ReplicateL (LoadL mem)));
4655  format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4656  ins_encode %{
4657    int vector_len = 0;
4658    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4659  %}
4660  ins_pipe( pipe_slow );
4661%}
4662
4663instruct Repl4L_mem_evex(vec dst, memory mem) %{
4664  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4665  match(Set dst (ReplicateL (LoadL mem)));
4666  format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4667  ins_encode %{
4668    int vector_len = 1;
4669    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4670  %}
4671  ins_pipe( pipe_slow );
4672%}
4673
4674instruct Repl8L_mem_evex(vec dst, memory mem) %{
4675  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4676  match(Set dst (ReplicateL (LoadL mem)));
4677  format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4678  ins_encode %{
4679    int vector_len = 2;
4680    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4681  %}
4682  ins_pipe( pipe_slow );
4683%}
4684
4685instruct Repl8L_zero_evex(vec dst, immL0 zero) %{
4686  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4687  match(Set dst (ReplicateL zero));
4688  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4689  ins_encode %{
4690    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4691    int vector_len = 2;
4692    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4693  %}
4694  ins_pipe( fpu_reg_reg );
4695%}
4696
4697instruct Repl8F_evex(vec dst, regF src) %{
4698  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4699  match(Set dst (ReplicateF src));
4700  format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4701  ins_encode %{
4702    int vector_len = 1;
4703    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4704  %}
4705  ins_pipe( pipe_slow );
4706%}
4707
4708instruct Repl8F_mem_evex(vec dst, memory mem) %{
4709  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4710  match(Set dst (ReplicateF (LoadF mem)));
4711  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4712  ins_encode %{
4713    int vector_len = 1;
4714    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4715  %}
4716  ins_pipe( pipe_slow );
4717%}
4718
4719instruct Repl16F_evex(vec dst, regF src) %{
4720  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4721  match(Set dst (ReplicateF src));
4722  format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4723  ins_encode %{
4724    int vector_len = 2;
4725    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4726  %}
4727  ins_pipe( pipe_slow );
4728%}
4729
4730instruct Repl16F_mem_evex(vec dst, memory mem) %{
4731  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4732  match(Set dst (ReplicateF (LoadF mem)));
4733  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4734  ins_encode %{
4735    int vector_len = 2;
4736    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4737  %}
4738  ins_pipe( pipe_slow );
4739%}
4740
4741instruct Repl16F_zero_evex(vec dst, immF0 zero) %{
4742  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4743  match(Set dst (ReplicateF zero));
4744  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4745  ins_encode %{
4746    // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
4747    int vector_len = 2;
4748    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4749  %}
4750  ins_pipe( fpu_reg_reg );
4751%}
4752
4753instruct Repl4D_evex(vec dst, regD src) %{
4754  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4755  match(Set dst (ReplicateD src));
4756  format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4757  ins_encode %{
4758    int vector_len = 1;
4759    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4760  %}
4761  ins_pipe( pipe_slow );
4762%}
4763
4764instruct Repl4D_mem_evex(vec dst, memory mem) %{
4765  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4766  match(Set dst (ReplicateD (LoadD mem)));
4767  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4768  ins_encode %{
4769    int vector_len = 1;
4770    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4771  %}
4772  ins_pipe( pipe_slow );
4773%}
4774
4775instruct Repl8D_evex(vec dst, regD src) %{
4776  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4777  match(Set dst (ReplicateD src));
4778  format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4779  ins_encode %{
4780    int vector_len = 2;
4781    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4782  %}
4783  ins_pipe( pipe_slow );
4784%}
4785
4786instruct Repl8D_mem_evex(vec dst, memory mem) %{
4787  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4788  match(Set dst (ReplicateD (LoadD mem)));
4789  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4790  ins_encode %{
4791    int vector_len = 2;
4792    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4793  %}
4794  ins_pipe( pipe_slow );
4795%}
4796
4797instruct Repl8D_zero_evex(vec dst, immD0 zero) %{
4798  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4799  match(Set dst (ReplicateD zero));
4800  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4801  ins_encode %{
4802    // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
4803    int vector_len = 2;
4804    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4805  %}
4806  ins_pipe( fpu_reg_reg );
4807%}
4808
4809// ====================REDUCTION ARITHMETIC=======================================
4810
4811instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4812  predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
4813  match(Set dst (AddReductionVI src1 src2));
4814  effect(TEMP tmp2, TEMP tmp);
4815  format %{ "movdqu  $tmp2,$src2\n\t"
4816            "phaddd  $tmp2,$tmp2\n\t"
4817            "movd    $tmp,$src1\n\t"
4818            "paddd   $tmp,$tmp2\n\t"
4819            "movd    $dst,$tmp\t! add reduction2I" %}
4820  ins_encode %{
4821    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4822    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4823    __ movdl($tmp$$XMMRegister, $src1$$Register);
4824    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4825    __ movdl($dst$$Register, $tmp$$XMMRegister);
4826  %}
4827  ins_pipe( pipe_slow );
4828%}
4829
4830instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4831  predicate(UseAVX > 0 && VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->length() == 2);
4832  match(Set dst (AddReductionVI src1 src2));
4833  effect(TEMP tmp, TEMP tmp2);
4834  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4835            "movd     $tmp2,$src1\n\t"
4836            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4837            "movd     $dst,$tmp2\t! add reduction2I" %}
4838  ins_encode %{
4839    int vector_len = 0;
4840    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4841    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4842    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4843    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4844  %}
4845  ins_pipe( pipe_slow );
4846%}
4847
4848instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4849  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 2);
4850  match(Set dst (AddReductionVI src1 src2));
4851  effect(TEMP tmp, TEMP tmp2);
4852  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4853            "vpaddd  $tmp,$src2,$tmp2\n\t"
4854            "movd    $tmp2,$src1\n\t"
4855            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4856            "movd    $dst,$tmp2\t! add reduction2I" %}
4857  ins_encode %{
4858    int vector_len = 0;
4859    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4860    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4861    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4862    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4863    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4864  %}
4865  ins_pipe( pipe_slow );
4866%}
4867
4868instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4869  predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
4870  match(Set dst (AddReductionVI src1 src2));
4871  effect(TEMP tmp, TEMP tmp2);
4872  format %{ "movdqu  $tmp,$src2\n\t"
4873            "phaddd  $tmp,$tmp\n\t"
4874            "phaddd  $tmp,$tmp\n\t"
4875            "movd    $tmp2,$src1\n\t"
4876            "paddd   $tmp2,$tmp\n\t"
4877            "movd    $dst,$tmp2\t! add reduction4I" %}
4878  ins_encode %{
4879    __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4880    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4881    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4882    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4883    __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4884    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4885  %}
4886  ins_pipe( pipe_slow );
4887%}
4888
4889instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4890  predicate(UseAVX > 0 && VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->length() == 4);
4891  match(Set dst (AddReductionVI src1 src2));
4892  effect(TEMP tmp, TEMP tmp2);
4893  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4894            "vphaddd  $tmp,$tmp,$tmp\n\t"
4895            "movd     $tmp2,$src1\n\t"
4896            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4897            "movd     $dst,$tmp2\t! add reduction4I" %}
4898  ins_encode %{
4899    int vector_len = 0;
4900    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4901    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4902    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4903    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4904    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4905  %}
4906  ins_pipe( pipe_slow );
4907%}
4908
4909instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4910  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 4);
4911  match(Set dst (AddReductionVI src1 src2));
4912  effect(TEMP tmp, TEMP tmp2);
4913  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4914            "vpaddd  $tmp,$src2,$tmp2\n\t"
4915            "pshufd  $tmp2,$tmp,0x1\n\t"
4916            "vpaddd  $tmp,$tmp,$tmp2\n\t"
4917            "movd    $tmp2,$src1\n\t"
4918            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4919            "movd    $dst,$tmp2\t! add reduction4I" %}
4920  ins_encode %{
4921    int vector_len = 0;
4922    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4923    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4924    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4925    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4926    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4927    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4928    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4929  %}
4930  ins_pipe( pipe_slow );
4931%}
4932
4933instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4934  predicate(UseAVX > 0 && VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->length() == 8);
4935  match(Set dst (AddReductionVI src1 src2));
4936  effect(TEMP tmp, TEMP tmp2);
4937  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4938            "vphaddd  $tmp,$tmp,$tmp2\n\t"
4939            "vextracti128_high  $tmp2,$tmp\n\t"
4940            "vpaddd   $tmp,$tmp,$tmp2\n\t"
4941            "movd     $tmp2,$src1\n\t"
4942            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4943            "movd     $dst,$tmp2\t! add reduction8I" %}
4944  ins_encode %{
4945    int vector_len = 1;
4946    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4947    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4948    __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4949    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4950    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4951    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4952    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4953  %}
4954  ins_pipe( pipe_slow );
4955%}
4956
4957instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
4958  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 8);
4959  match(Set dst (AddReductionVI src1 src2));
4960  effect(TEMP tmp, TEMP tmp2);
4961  format %{ "vextracti128_high  $tmp,$src2\n\t"
4962            "vpaddd  $tmp,$tmp,$src2\n\t"
4963            "pshufd  $tmp2,$tmp,0xE\n\t"
4964            "vpaddd  $tmp,$tmp,$tmp2\n\t"
4965            "pshufd  $tmp2,$tmp,0x1\n\t"
4966            "vpaddd  $tmp,$tmp,$tmp2\n\t"
4967            "movd    $tmp2,$src1\n\t"
4968            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4969            "movd    $dst,$tmp2\t! add reduction8I" %}
4970  ins_encode %{
4971    int vector_len = 0;
4972    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4973    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4974    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4975    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4976    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4977    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4978    __ movdl($tmp2$$XMMRegister, $src1$$Register);
4979    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4980    __ movdl($dst$$Register, $tmp2$$XMMRegister);
4981  %}
4982  ins_pipe( pipe_slow );
4983%}
4984
4985instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
4986  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 16);
4987  match(Set dst (AddReductionVI src1 src2));
4988  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4989  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4990            "vpaddd  $tmp3,$tmp3,$src2\n\t"
4991            "vextracti128_high  $tmp,$tmp3\n\t"
4992            "vpaddd  $tmp,$tmp,$tmp3\n\t"
4993            "pshufd  $tmp2,$tmp,0xE\n\t"
4994            "vpaddd  $tmp,$tmp,$tmp2\n\t"
4995            "pshufd  $tmp2,$tmp,0x1\n\t"
4996            "vpaddd  $tmp,$tmp,$tmp2\n\t"
4997            "movd    $tmp2,$src1\n\t"
4998            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4999            "movd    $dst,$tmp2\t! mul reduction16I" %}
5000  ins_encode %{
5001    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5002    __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5003    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5004    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5005    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5006    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5007    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5008    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5009    __ movdl($tmp2$$XMMRegister, $src1$$Register);
5010    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5011    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5012  %}
5013  ins_pipe( pipe_slow );
5014%}
5015
5016#ifdef _LP64
5017instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
5018  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5019  match(Set dst (AddReductionVL src1 src2));
5020  effect(TEMP tmp, TEMP tmp2);
5021  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5022            "vpaddq  $tmp,$src2,$tmp2\n\t"
5023            "movdq   $tmp2,$src1\n\t"
5024            "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5025            "movdq   $dst,$tmp2\t! add reduction2L" %}
5026  ins_encode %{
5027    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5028    __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5029    __ movdq($tmp2$$XMMRegister, $src1$$Register);
5030    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5031    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5032  %}
5033  ins_pipe( pipe_slow );
5034%}
5035
5036instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
5037  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5038  match(Set dst (AddReductionVL src1 src2));
5039  effect(TEMP tmp, TEMP tmp2);
5040  format %{ "vextracti128_high  $tmp,$src2\n\t"
5041            "vpaddq  $tmp2,$tmp,$src2\n\t"
5042            "pshufd  $tmp,$tmp2,0xE\n\t"
5043            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5044            "movdq   $tmp,$src1\n\t"
5045            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5046            "movdq   $dst,$tmp2\t! add reduction4L" %}
5047  ins_encode %{
5048    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5049    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5050    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5051    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5052    __ movdq($tmp$$XMMRegister, $src1$$Register);
5053    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5054    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5055  %}
5056  ins_pipe( pipe_slow );
5057%}
5058
5059instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
5060  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5061  match(Set dst (AddReductionVL src1 src2));
5062  effect(TEMP tmp, TEMP tmp2);
5063  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5064            "vpaddq  $tmp2,$tmp2,$src2\n\t"
5065            "vextracti128_high  $tmp,$tmp2\n\t"
5066            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5067            "pshufd  $tmp,$tmp2,0xE\n\t"
5068            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5069            "movdq   $tmp,$src1\n\t"
5070            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5071            "movdq   $dst,$tmp2\t! add reduction8L" %}
5072  ins_encode %{
5073    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5074    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5075    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5076    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5077    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5078    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5079    __ movdq($tmp$$XMMRegister, $src1$$Register);
5080    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5081    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5082  %}
5083  ins_pipe( pipe_slow );
5084%}
5085#endif
5086
5087instruct rsadd2F_reduction_reg(regF dst, vec src2, vec tmp) %{
5088  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5089  match(Set dst (AddReductionVF dst src2));
5090  effect(TEMP dst, TEMP tmp);
5091  format %{ "addss   $dst,$src2\n\t"
5092            "pshufd  $tmp,$src2,0x01\n\t"
5093            "addss   $dst,$tmp\t! add reduction2F" %}
5094  ins_encode %{
5095    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5096    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5097    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5098  %}
5099  ins_pipe( pipe_slow );
5100%}
5101
5102instruct rvadd2F_reduction_reg(regF dst, vec src2, vec tmp) %{
5103  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5104  match(Set dst (AddReductionVF dst src2));
5105  effect(TEMP dst, TEMP tmp);
5106  format %{ "vaddss  $dst,$dst,$src2\n\t"
5107            "pshufd  $tmp,$src2,0x01\n\t"
5108            "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5109  ins_encode %{
5110    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5111    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5112    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5113  %}
5114  ins_pipe( pipe_slow );
5115%}
5116
5117instruct rsadd4F_reduction_reg(regF dst, vec src2, vec tmp) %{
5118  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5119  match(Set dst (AddReductionVF dst src2));
5120  effect(TEMP dst, TEMP tmp);
5121  format %{ "addss   $dst,$src2\n\t"
5122            "pshufd  $tmp,$src2,0x01\n\t"
5123            "addss   $dst,$tmp\n\t"
5124            "pshufd  $tmp,$src2,0x02\n\t"
5125            "addss   $dst,$tmp\n\t"
5126            "pshufd  $tmp,$src2,0x03\n\t"
5127            "addss   $dst,$tmp\t! add reduction4F" %}
5128  ins_encode %{
5129    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5130    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5131    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5132    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5133    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5134    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5135    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5136  %}
5137  ins_pipe( pipe_slow );
5138%}
5139
5140instruct rvadd4F_reduction_reg(regF dst, vec src2, vec tmp) %{
5141  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5142  match(Set dst (AddReductionVF dst src2));
5143  effect(TEMP tmp, TEMP dst);
5144  format %{ "vaddss  $dst,dst,$src2\n\t"
5145            "pshufd  $tmp,$src2,0x01\n\t"
5146            "vaddss  $dst,$dst,$tmp\n\t"
5147            "pshufd  $tmp,$src2,0x02\n\t"
5148            "vaddss  $dst,$dst,$tmp\n\t"
5149            "pshufd  $tmp,$src2,0x03\n\t"
5150            "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5151  ins_encode %{
5152    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5153    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5154    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5155    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5156    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5157    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5158    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5159  %}
5160  ins_pipe( pipe_slow );
5161%}
5162
5163instruct radd8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
5164  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5165  match(Set dst (AddReductionVF dst src2));
5166  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5167  format %{ "vaddss  $dst,$dst,$src2\n\t"
5168            "pshufd  $tmp,$src2,0x01\n\t"
5169            "vaddss  $dst,$dst,$tmp\n\t"
5170            "pshufd  $tmp,$src2,0x02\n\t"
5171            "vaddss  $dst,$dst,$tmp\n\t"
5172            "pshufd  $tmp,$src2,0x03\n\t"
5173            "vaddss  $dst,$dst,$tmp\n\t"
5174            "vextractf128_high  $tmp2,$src2\n\t"
5175            "vaddss  $dst,$dst,$tmp2\n\t"
5176            "pshufd  $tmp,$tmp2,0x01\n\t"
5177            "vaddss  $dst,$dst,$tmp\n\t"
5178            "pshufd  $tmp,$tmp2,0x02\n\t"
5179            "vaddss  $dst,$dst,$tmp\n\t"
5180            "pshufd  $tmp,$tmp2,0x03\n\t"
5181            "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5182  ins_encode %{
5183    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5184    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5185    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5186    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5187    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5188    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5189    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5190    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5191    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5192    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5193    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5194    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5195    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5196    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5197    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5198  %}
5199  ins_pipe( pipe_slow );
5200%}
5201
5202instruct radd16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
5203  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 16);
5204  match(Set dst (AddReductionVF dst src2));
5205  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5206  format %{ "vaddss  $dst,$dst,$src2\n\t"
5207            "pshufd  $tmp,$src2,0x01\n\t"
5208            "vaddss  $dst,$dst,$tmp\n\t"
5209            "pshufd  $tmp,$src2,0x02\n\t"
5210            "vaddss  $dst,$dst,$tmp\n\t"
5211            "pshufd  $tmp,$src2,0x03\n\t"
5212            "vaddss  $dst,$dst,$tmp\n\t"
5213            "vextractf32x4  $tmp2,$src2,0x1\n\t"
5214            "vaddss  $dst,$dst,$tmp2\n\t"
5215            "pshufd  $tmp,$tmp2,0x01\n\t"
5216            "vaddss  $dst,$dst,$tmp\n\t"
5217            "pshufd  $tmp,$tmp2,0x02\n\t"
5218            "vaddss  $dst,$dst,$tmp\n\t"
5219            "pshufd  $tmp,$tmp2,0x03\n\t"
5220            "vaddss  $dst,$dst,$tmp\n\t"
5221            "vextractf32x4  $tmp2,$src2,0x2\n\t"
5222            "vaddss  $dst,$dst,$tmp2\n\t"
5223            "pshufd  $tmp,$tmp2,0x01\n\t"
5224            "vaddss  $dst,$dst,$tmp\n\t"
5225            "pshufd  $tmp,$tmp2,0x02\n\t"
5226            "vaddss  $dst,$dst,$tmp\n\t"
5227            "pshufd  $tmp,$tmp2,0x03\n\t"
5228            "vaddss  $dst,$dst,$tmp\n\t"
5229            "vextractf32x4  $tmp2,$src2,0x3\n\t"
5230            "vaddss  $dst,$dst,$tmp2\n\t"
5231            "pshufd  $tmp,$tmp2,0x01\n\t"
5232            "vaddss  $dst,$dst,$tmp\n\t"
5233            "pshufd  $tmp,$tmp2,0x02\n\t"
5234            "vaddss  $dst,$dst,$tmp\n\t"
5235            "pshufd  $tmp,$tmp2,0x03\n\t"
5236            "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5237  ins_encode %{
5238    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5239    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5240    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5241    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5242    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5243    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5244    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5245    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5246    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5247    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5248    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5249    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5250    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5251    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5252    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5253    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5254    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5255    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5256    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5257    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5258    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5259    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5260    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5261    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5262    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5263    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5264    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5265    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5266    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5267    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5268    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5269  %}
5270  ins_pipe( pipe_slow );
5271%}
5272
5273instruct rsadd2D_reduction_reg(regD dst, vec src2, vec tmp) %{
5274  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5275  match(Set dst (AddReductionVD dst src2));
5276  effect(TEMP tmp, TEMP dst);
5277  format %{ "addsd   $dst,$src2\n\t"
5278            "pshufd  $tmp,$src2,0xE\n\t"
5279            "addsd   $dst,$tmp\t! add reduction2D" %}
5280  ins_encode %{
5281    __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5282    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5283    __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5284  %}
5285  ins_pipe( pipe_slow );
5286%}
5287
5288instruct rvadd2D_reduction_reg(regD dst, vec src2, vec tmp) %{
5289  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5290  match(Set dst (AddReductionVD dst src2));
5291  effect(TEMP tmp, TEMP dst);
5292  format %{ "vaddsd  $dst,$dst,$src2\n\t"
5293            "pshufd  $tmp,$src2,0xE\n\t"
5294            "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5295  ins_encode %{
5296    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5297    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5298    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5299  %}
5300  ins_pipe( pipe_slow );
5301%}
5302
5303instruct rvadd4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
5304  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5305  match(Set dst (AddReductionVD dst src2));
5306  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5307  format %{ "vaddsd  $dst,$dst,$src2\n\t"
5308            "pshufd  $tmp,$src2,0xE\n\t"
5309            "vaddsd  $dst,$dst,$tmp\n\t"
5310            "vextractf128  $tmp2,$src2,0x1\n\t"
5311            "vaddsd  $dst,$dst,$tmp2\n\t"
5312            "pshufd  $tmp,$tmp2,0xE\n\t"
5313            "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5314  ins_encode %{
5315    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5316    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5317    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5318    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5319    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5320    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5321    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5322  %}
5323  ins_pipe( pipe_slow );
5324%}
5325
5326instruct rvadd8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
5327  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5328  match(Set dst (AddReductionVD dst src2));
5329  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5330  format %{ "vaddsd  $dst,$dst,$src2\n\t"
5331            "pshufd  $tmp,$src2,0xE\n\t"
5332            "vaddsd  $dst,$dst,$tmp\n\t"
5333            "vextractf32x4  $tmp2,$src2,0x1\n\t"
5334            "vaddsd  $dst,$dst,$tmp2\n\t"
5335            "pshufd  $tmp,$tmp2,0xE\n\t"
5336            "vaddsd  $dst,$dst,$tmp\n\t"
5337            "vextractf32x4  $tmp2,$src2,0x2\n\t"
5338            "vaddsd  $dst,$dst,$tmp2\n\t"
5339            "pshufd  $tmp,$tmp2,0xE\n\t"
5340            "vaddsd  $dst,$dst,$tmp\n\t"
5341            "vextractf32x4  $tmp2,$src2,0x3\n\t"
5342            "vaddsd  $dst,$dst,$tmp2\n\t"
5343            "pshufd  $tmp,$tmp2,0xE\n\t"
5344            "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5345  ins_encode %{
5346    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5347    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5348    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5349    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5350    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5351    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5352    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5353    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5354    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5355    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5356    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5357    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5358    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5359    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5360    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5361  %}
5362  ins_pipe( pipe_slow );
5363%}
5364
5365instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
5366  predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5367  match(Set dst (MulReductionVI src1 src2));
5368  effect(TEMP tmp, TEMP tmp2);
5369  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5370            "pmulld  $tmp2,$src2\n\t"
5371            "movd    $tmp,$src1\n\t"
5372            "pmulld  $tmp2,$tmp\n\t"
5373            "movd    $dst,$tmp2\t! mul reduction2I" %}
5374  ins_encode %{
5375    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5376    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5377    __ movdl($tmp$$XMMRegister, $src1$$Register);
5378    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5379    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5380  %}
5381  ins_pipe( pipe_slow );
5382%}
5383
5384instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
5385  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5386  match(Set dst (MulReductionVI src1 src2));
5387  effect(TEMP tmp, TEMP tmp2);
5388  format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5389            "vpmulld  $tmp,$src2,$tmp2\n\t"
5390            "movd     $tmp2,$src1\n\t"
5391            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5392            "movd     $dst,$tmp2\t! mul reduction2I" %}
5393  ins_encode %{
5394    int vector_len = 0;
5395    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5396    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5397    __ movdl($tmp2$$XMMRegister, $src1$$Register);
5398    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5399    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5400  %}
5401  ins_pipe( pipe_slow );
5402%}
5403
5404instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
5405  predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5406  match(Set dst (MulReductionVI src1 src2));
5407  effect(TEMP tmp, TEMP tmp2);
5408  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5409            "pmulld  $tmp2,$src2\n\t"
5410            "pshufd  $tmp,$tmp2,0x1\n\t"
5411            "pmulld  $tmp2,$tmp\n\t"
5412            "movd    $tmp,$src1\n\t"
5413            "pmulld  $tmp2,$tmp\n\t"
5414            "movd    $dst,$tmp2\t! mul reduction4I" %}
5415  ins_encode %{
5416    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5417    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5418    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5419    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5420    __ movdl($tmp$$XMMRegister, $src1$$Register);
5421    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5422    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5423  %}
5424  ins_pipe( pipe_slow );
5425%}
5426
5427instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
5428  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5429  match(Set dst (MulReductionVI src1 src2));
5430  effect(TEMP tmp, TEMP tmp2);
5431  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5432            "vpmulld  $tmp,$src2,$tmp2\n\t"
5433            "pshufd   $tmp2,$tmp,0x1\n\t"
5434            "vpmulld  $tmp,$tmp,$tmp2\n\t"
5435            "movd     $tmp2,$src1\n\t"
5436            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5437            "movd     $dst,$tmp2\t! mul reduction4I" %}
5438  ins_encode %{
5439    int vector_len = 0;
5440    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5441    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5442    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5443    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5444    __ movdl($tmp2$$XMMRegister, $src1$$Register);
5445    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5446    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5447  %}
5448  ins_pipe( pipe_slow );
5449%}
5450
5451instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vec src2, vec tmp, vec tmp2) %{
5452  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5453  match(Set dst (MulReductionVI src1 src2));
5454  effect(TEMP tmp, TEMP tmp2);
5455  format %{ "vextracti128_high  $tmp,$src2\n\t"
5456            "vpmulld  $tmp,$tmp,$src2\n\t"
5457            "pshufd   $tmp2,$tmp,0xE\n\t"
5458            "vpmulld  $tmp,$tmp,$tmp2\n\t"
5459            "pshufd   $tmp2,$tmp,0x1\n\t"
5460            "vpmulld  $tmp,$tmp,$tmp2\n\t"
5461            "movd     $tmp2,$src1\n\t"
5462            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5463            "movd     $dst,$tmp2\t! mul reduction8I" %}
5464  ins_encode %{
5465    int vector_len = 0;
5466    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5467    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5468    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5469    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5470    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5471    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5472    __ movdl($tmp2$$XMMRegister, $src1$$Register);
5473    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5474    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5475  %}
5476  ins_pipe( pipe_slow );
5477%}
5478
5479instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVec src2, legVec tmp, legVec tmp2, legVec tmp3) %{
5480  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 16);
5481  match(Set dst (MulReductionVI src1 src2));
5482  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5483  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5484            "vpmulld  $tmp3,$tmp3,$src2\n\t"
5485            "vextracti128_high  $tmp,$tmp3\n\t"
5486            "vpmulld  $tmp,$tmp,$src2\n\t"
5487            "pshufd   $tmp2,$tmp,0xE\n\t"
5488            "vpmulld  $tmp,$tmp,$tmp2\n\t"
5489            "pshufd   $tmp2,$tmp,0x1\n\t"
5490            "vpmulld  $tmp,$tmp,$tmp2\n\t"
5491            "movd     $tmp2,$src1\n\t"
5492            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5493            "movd     $dst,$tmp2\t! mul reduction16I" %}
5494  ins_encode %{
5495    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5496    __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5497    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5498    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5499    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5500    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5501    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5502    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5503    __ movdl($tmp2$$XMMRegister, $src1$$Register);
5504    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5505    __ movdl($dst$$Register, $tmp2$$XMMRegister);
5506  %}
5507  ins_pipe( pipe_slow );
5508%}
5509
5510#ifdef _LP64
5511instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
5512  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->in(2)->bottom_type()->is_vect()->length() == 2);
5513  match(Set dst (MulReductionVL src1 src2));
5514  effect(TEMP tmp, TEMP tmp2);
5515  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5516            "vpmullq  $tmp,$src2,$tmp2\n\t"
5517            "movdq    $tmp2,$src1\n\t"
5518            "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5519            "movdq    $dst,$tmp2\t! mul reduction2L" %}
5520  ins_encode %{
5521    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5522    __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5523    __ movdq($tmp2$$XMMRegister, $src1$$Register);
5524    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5525    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5526  %}
5527  ins_pipe( pipe_slow );
5528%}
5529
5530instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vec src2, vec tmp, vec tmp2) %{
5531  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->in(2)->bottom_type()->is_vect()->length() == 4);
5532  match(Set dst (MulReductionVL src1 src2));
5533  effect(TEMP tmp, TEMP tmp2);
5534  format %{ "vextracti128_high  $tmp,$src2\n\t"
5535            "vpmullq  $tmp2,$tmp,$src2\n\t"
5536            "pshufd   $tmp,$tmp2,0xE\n\t"
5537            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5538            "movdq    $tmp,$src1\n\t"
5539            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5540            "movdq    $dst,$tmp2\t! mul reduction4L" %}
5541  ins_encode %{
5542    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5543    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5544    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5545    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5546    __ movdq($tmp$$XMMRegister, $src1$$Register);
5547    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5548    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5549  %}
5550  ins_pipe( pipe_slow );
5551%}
5552
5553instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVec src2, legVec tmp, legVec tmp2) %{
5554  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->in(2)->bottom_type()->is_vect()->length() == 8);
5555  match(Set dst (MulReductionVL src1 src2));
5556  effect(TEMP tmp, TEMP tmp2);
5557  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5558            "vpmullq  $tmp2,$tmp2,$src2\n\t"
5559            "vextracti128_high  $tmp,$tmp2\n\t"
5560            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5561            "pshufd   $tmp,$tmp2,0xE\n\t"
5562            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5563            "movdq    $tmp,$src1\n\t"
5564            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5565            "movdq    $dst,$tmp2\t! mul reduction8L" %}
5566  ins_encode %{
5567    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5568    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5569    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5570    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5571    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5572    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5573    __ movdq($tmp$$XMMRegister, $src1$$Register);
5574    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5575    __ movdq($dst$$Register, $tmp2$$XMMRegister);
5576  %}
5577  ins_pipe( pipe_slow );
5578%}
5579#endif
5580
5581instruct rsmul2F_reduction(regF dst, vec src2, vec tmp) %{
5582  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5583  match(Set dst (MulReductionVF dst src2));
5584  effect(TEMP dst, TEMP tmp);
5585  format %{ "mulss   $dst,$src2\n\t"
5586            "pshufd  $tmp,$src2,0x01\n\t"
5587            "mulss   $dst,$tmp\t! mul reduction2F" %}
5588  ins_encode %{
5589    __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5590    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5591    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5592  %}
5593  ins_pipe( pipe_slow );
5594%}
5595
5596instruct rvmul2F_reduction_reg(regF dst, vec src2, vec tmp) %{
5597  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5598  match(Set dst (MulReductionVF dst src2));
5599  effect(TEMP tmp, TEMP dst);
5600  format %{ "vmulss  $dst,$dst,$src2\n\t"
5601            "pshufd  $tmp,$src2,0x01\n\t"
5602            "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5603  ins_encode %{
5604    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5605    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5606    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5607  %}
5608  ins_pipe( pipe_slow );
5609%}
5610
5611instruct rsmul4F_reduction_reg(regF dst, vec src2, vec tmp) %{
5612  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5613  match(Set dst (MulReductionVF dst src2));
5614  effect(TEMP dst, TEMP tmp);
5615  format %{ "mulss   $dst,$src2\n\t"
5616            "pshufd  $tmp,$src2,0x01\n\t"
5617            "mulss   $dst,$tmp\n\t"
5618            "pshufd  $tmp,$src2,0x02\n\t"
5619            "mulss   $dst,$tmp\n\t"
5620            "pshufd  $tmp,$src2,0x03\n\t"
5621            "mulss   $dst,$tmp\t! mul reduction4F" %}
5622  ins_encode %{
5623    __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5624    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5625    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5626    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5627    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5628    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5629    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5630  %}
5631  ins_pipe( pipe_slow );
5632%}
5633
5634instruct rvmul4F_reduction_reg(regF dst, vec src2, vec tmp) %{
5635  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5636  match(Set dst (MulReductionVF dst src2));
5637  effect(TEMP tmp, TEMP dst);
5638  format %{ "vmulss  $dst,$dst,$src2\n\t"
5639            "pshufd  $tmp,$src2,0x01\n\t"
5640            "vmulss  $dst,$dst,$tmp\n\t"
5641            "pshufd  $tmp,$src2,0x02\n\t"
5642            "vmulss  $dst,$dst,$tmp\n\t"
5643            "pshufd  $tmp,$src2,0x03\n\t"
5644            "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5645  ins_encode %{
5646    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5647    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5648    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5649    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5650    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5651    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5652    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5653  %}
5654  ins_pipe( pipe_slow );
5655%}
5656
5657instruct rvmul8F_reduction_reg(regF dst, vec src2, vec tmp, vec tmp2) %{
5658  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5659  match(Set dst (MulReductionVF dst src2));
5660  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5661  format %{ "vmulss  $dst,$dst,$src2\n\t"
5662            "pshufd  $tmp,$src2,0x01\n\t"
5663            "vmulss  $dst,$dst,$tmp\n\t"
5664            "pshufd  $tmp,$src2,0x02\n\t"
5665            "vmulss  $dst,$dst,$tmp\n\t"
5666            "pshufd  $tmp,$src2,0x03\n\t"
5667            "vmulss  $dst,$dst,$tmp\n\t"
5668            "vextractf128_high  $tmp2,$src2\n\t"
5669            "vmulss  $dst,$dst,$tmp2\n\t"
5670            "pshufd  $tmp,$tmp2,0x01\n\t"
5671            "vmulss  $dst,$dst,$tmp\n\t"
5672            "pshufd  $tmp,$tmp2,0x02\n\t"
5673            "vmulss  $dst,$dst,$tmp\n\t"
5674            "pshufd  $tmp,$tmp2,0x03\n\t"
5675            "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5676  ins_encode %{
5677    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5678    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5679    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5680    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5681    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5682    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5683    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5684    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5685    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5686    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5687    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5688    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5689    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5690    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5691    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5692  %}
5693  ins_pipe( pipe_slow );
5694%}
5695
5696instruct rvmul16F_reduction_reg(regF dst, legVec src2, legVec tmp, legVec tmp2) %{
5697  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 16);
5698  match(Set dst (MulReductionVF dst src2));
5699  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5700  format %{ "vmulss  $dst,$dst,$src2\n\t"
5701            "pshufd  $tmp,$src2,0x01\n\t"
5702            "vmulss  $dst,$dst,$tmp\n\t"
5703            "pshufd  $tmp,$src2,0x02\n\t"
5704            "vmulss  $dst,$dst,$tmp\n\t"
5705            "pshufd  $tmp,$src2,0x03\n\t"
5706            "vmulss  $dst,$dst,$tmp\n\t"
5707            "vextractf32x4  $tmp2,$src2,0x1\n\t"
5708            "vmulss  $dst,$dst,$tmp2\n\t"
5709            "pshufd  $tmp,$tmp2,0x01\n\t"
5710            "vmulss  $dst,$dst,$tmp\n\t"
5711            "pshufd  $tmp,$tmp2,0x02\n\t"
5712            "vmulss  $dst,$dst,$tmp\n\t"
5713            "pshufd  $tmp,$tmp2,0x03\n\t"
5714            "vmulss  $dst,$dst,$tmp\n\t"
5715            "vextractf32x4  $tmp2,$src2,0x2\n\t"
5716            "vmulss  $dst,$dst,$tmp2\n\t"
5717            "pshufd  $tmp,$tmp2,0x01\n\t"
5718            "vmulss  $dst,$dst,$tmp\n\t"
5719            "pshufd  $tmp,$tmp2,0x02\n\t"
5720            "vmulss  $dst,$dst,$tmp\n\t"
5721            "pshufd  $tmp,$tmp2,0x03\n\t"
5722            "vmulss  $dst,$dst,$tmp\n\t"
5723            "vextractf32x4  $tmp2,$src2,0x3\n\t"
5724            "vmulss  $dst,$dst,$tmp2\n\t"
5725            "pshufd  $tmp,$tmp2,0x01\n\t"
5726            "vmulss  $dst,$dst,$tmp\n\t"
5727            "pshufd  $tmp,$tmp2,0x02\n\t"
5728            "vmulss  $dst,$dst,$tmp\n\t"
5729            "pshufd  $tmp,$tmp2,0x03\n\t"
5730            "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5731  ins_encode %{
5732    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5733    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5734    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5735    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5736    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5737    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5738    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5739    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5740    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5741    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5742    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5743    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5744    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5745    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5746    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5747    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5748    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5749    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5750    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5751    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5752    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5753    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5754    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5755    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5756    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5757    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5758    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5759    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5760    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5761    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5762    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5763  %}
5764  ins_pipe( pipe_slow );
5765%}
5766
5767instruct rsmul2D_reduction_reg(regD dst, vec src2, vec tmp) %{
5768  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5769  match(Set dst (MulReductionVD dst src2));
5770  effect(TEMP dst, TEMP tmp);
5771  format %{ "mulsd   $dst,$src2\n\t"
5772            "pshufd  $tmp,$src2,0xE\n\t"
5773            "mulsd   $dst,$tmp\t! mul reduction2D" %}
5774  ins_encode %{
5775    __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5776    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5777    __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5778  %}
5779  ins_pipe( pipe_slow );
5780%}
5781
5782instruct rvmul2D_reduction_reg(regD dst, vec src2, vec tmp) %{
5783  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 2);
5784  match(Set dst (MulReductionVD dst src2));
5785  effect(TEMP tmp, TEMP dst);
5786  format %{ "vmulsd  $dst,$dst,$src2\n\t"
5787            "pshufd  $tmp,$src2,0xE\n\t"
5788            "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5789  ins_encode %{
5790    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5791    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5792    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5793  %}
5794  ins_pipe( pipe_slow );
5795%}
5796
5797instruct rvmul4D_reduction_reg(regD dst, vec src2, vec tmp, vec tmp2) %{
5798  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->length() == 4);
5799  match(Set dst (MulReductionVD dst src2));
5800  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5801  format %{ "vmulsd  $dst,$dst,$src2\n\t"
5802            "pshufd  $tmp,$src2,0xE\n\t"
5803            "vmulsd  $dst,$dst,$tmp\n\t"
5804            "vextractf128_high  $tmp2,$src2\n\t"
5805            "vmulsd  $dst,$dst,$tmp2\n\t"
5806            "pshufd  $tmp,$tmp2,0xE\n\t"
5807            "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5808  ins_encode %{
5809    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5810    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5811    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5812    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5813    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5814    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5815    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5816  %}
5817  ins_pipe( pipe_slow );
5818%}
5819
5820instruct rvmul8D_reduction_reg(regD dst, legVec src2, legVec tmp, legVec tmp2) %{
5821  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->length() == 8);
5822  match(Set dst (MulReductionVD dst src2));
5823  effect(TEMP tmp, TEMP dst, TEMP tmp2);
5824  format %{ "vmulsd  $dst,$dst,$src2\n\t"
5825            "pshufd  $tmp,$src2,0xE\n\t"
5826            "vmulsd  $dst,$dst,$tmp\n\t"
5827            "vextractf32x4  $tmp2,$src2,0x1\n\t"
5828            "vmulsd  $dst,$dst,$tmp2\n\t"
5829            "pshufd  $tmp,$src2,0xE\n\t"
5830            "vmulsd  $dst,$dst,$tmp\n\t"
5831            "vextractf32x4  $tmp2,$src2,0x2\n\t"
5832            "vmulsd  $dst,$dst,$tmp2\n\t"
5833            "pshufd  $tmp,$tmp2,0xE\n\t"
5834            "vmulsd  $dst,$dst,$tmp\n\t"
5835            "vextractf32x4  $tmp2,$src2,0x3\n\t"
5836            "vmulsd  $dst,$dst,$tmp2\n\t"
5837            "pshufd  $tmp,$tmp2,0xE\n\t"
5838            "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5839  ins_encode %{
5840    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5841    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5842    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5843    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5844    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5845    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5846    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5847    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5848    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5849    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5850    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5851    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5852    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5853    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5854    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5855  %}
5856  ins_pipe( pipe_slow );
5857%}
5858
5859// ====================VECTOR ARITHMETIC=======================================
5860
5861// --------------------------------- ADD --------------------------------------
5862
5863// Bytes vector add
5864instruct vaddB(vec dst, vec src) %{
5865  predicate(UseAVX == 0);
5866  match(Set dst (AddVB dst src));
5867  format %{ "paddb   $dst,$src\t! add packedB" %}
5868  ins_encode %{
5869    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5870  %}
5871  ins_pipe( pipe_slow );
5872%}
5873
5874instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5875  predicate(UseAVX > 0);
5876  match(Set dst (AddVB src1 src2));
5877  format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5878  ins_encode %{
5879    int vector_len = vector_length_encoding(this);
5880    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5881  %}
5882  ins_pipe( pipe_slow );
5883%}
5884
5885instruct vaddB_mem(vec dst, vec src, memory mem) %{
5886  predicate(UseAVX > 0);
5887  match(Set dst (AddVB src (LoadVector mem)));
5888  format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5889  ins_encode %{
5890    int vector_len = vector_length_encoding(this);
5891    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5892  %}
5893  ins_pipe( pipe_slow );
5894%}
5895
5896// Shorts/Chars vector add
5897instruct vaddS(vec dst, vec src) %{
5898  predicate(UseAVX == 0);
5899  match(Set dst (AddVS dst src));
5900  format %{ "paddw   $dst,$src\t! add packedS" %}
5901  ins_encode %{
5902    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5903  %}
5904  ins_pipe( pipe_slow );
5905%}
5906
5907instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5908  predicate(UseAVX > 0);
5909  match(Set dst (AddVS src1 src2));
5910  format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5911  ins_encode %{
5912    int vector_len = vector_length_encoding(this);
5913    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5914  %}
5915  ins_pipe( pipe_slow );
5916%}
5917
5918instruct vaddS_mem(vec dst, vec src, memory mem) %{
5919  predicate(UseAVX > 0);
5920  match(Set dst (AddVS src (LoadVector mem)));
5921  format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5922  ins_encode %{
5923    int vector_len = vector_length_encoding(this);
5924    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5925  %}
5926  ins_pipe( pipe_slow );
5927%}
5928
5929// Integers vector add
5930instruct vaddI(vec dst, vec src) %{
5931  predicate(UseAVX == 0);
5932  match(Set dst (AddVI dst src));
5933  format %{ "paddd   $dst,$src\t! add packedI" %}
5934  ins_encode %{
5935    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5936  %}
5937  ins_pipe( pipe_slow );
5938%}
5939
5940instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5941  predicate(UseAVX > 0);
5942  match(Set dst (AddVI src1 src2));
5943  format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5944  ins_encode %{
5945    int vector_len = vector_length_encoding(this);
5946    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5947  %}
5948  ins_pipe( pipe_slow );
5949%}
5950
5951
5952instruct vaddI_mem(vec dst, vec src, memory mem) %{
5953  predicate(UseAVX > 0);
5954  match(Set dst (AddVI src (LoadVector mem)));
5955  format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5956  ins_encode %{
5957    int vector_len = vector_length_encoding(this);
5958    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5959  %}
5960  ins_pipe( pipe_slow );
5961%}
5962
5963// Longs vector add
5964instruct vaddL(vec dst, vec src) %{
5965  predicate(UseAVX == 0);
5966  match(Set dst (AddVL dst src));
5967  format %{ "paddq   $dst,$src\t! add packedL" %}
5968  ins_encode %{
5969    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5970  %}
5971  ins_pipe( pipe_slow );
5972%}
5973
5974instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5975  predicate(UseAVX > 0);
5976  match(Set dst (AddVL src1 src2));
5977  format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5978  ins_encode %{
5979    int vector_len = vector_length_encoding(this);
5980    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5981  %}
5982  ins_pipe( pipe_slow );
5983%}
5984
5985instruct vaddL_mem(vec dst, vec src, memory mem) %{
5986  predicate(UseAVX > 0);
5987  match(Set dst (AddVL src (LoadVector mem)));
5988  format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5989  ins_encode %{
5990    int vector_len = vector_length_encoding(this);
5991    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5992  %}
5993  ins_pipe( pipe_slow );
5994%}
5995
5996// Floats vector add
5997instruct vaddF(vec dst, vec src) %{
5998  predicate(UseAVX == 0);
5999  match(Set dst (AddVF dst src));
6000  format %{ "addps   $dst,$src\t! add packedF" %}
6001  ins_encode %{
6002    __ addps($dst$$XMMRegister, $src$$XMMRegister);
6003  %}
6004  ins_pipe( pipe_slow );
6005%}
6006
6007instruct vaddF_reg(vec dst, vec src1, vec src2) %{
6008  predicate(UseAVX > 0);
6009  match(Set dst (AddVF src1 src2));
6010  format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
6011  ins_encode %{
6012    int vector_len = vector_length_encoding(this);
6013    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6014  %}
6015  ins_pipe( pipe_slow );
6016%}
6017
6018instruct vaddF_mem(vec dst, vec src, memory mem) %{
6019  predicate(UseAVX > 0);
6020  match(Set dst (AddVF src (LoadVector mem)));
6021  format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
6022  ins_encode %{
6023    int vector_len = vector_length_encoding(this);
6024    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6025  %}
6026  ins_pipe( pipe_slow );
6027%}
6028
6029// Doubles vector add
6030instruct vaddD(vec dst, vec src) %{
6031  predicate(UseAVX == 0);
6032  match(Set dst (AddVD dst src));
6033  format %{ "addpd   $dst,$src\t! add packedD" %}
6034  ins_encode %{
6035    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6036  %}
6037  ins_pipe( pipe_slow );
6038%}
6039
6040instruct vaddD_reg(vec dst, vec src1, vec src2) %{
6041  predicate(UseAVX > 0);
6042  match(Set dst (AddVD src1 src2));
6043  format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
6044  ins_encode %{
6045    int vector_len = vector_length_encoding(this);
6046    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6047  %}
6048  ins_pipe( pipe_slow );
6049%}
6050
6051instruct vaddD_mem(vec dst, vec src, memory mem) %{
6052  predicate(UseAVX > 0);
6053  match(Set dst (AddVD src (LoadVector mem)));
6054  format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
6055  ins_encode %{
6056    int vector_len = vector_length_encoding(this);
6057    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6058  %}
6059  ins_pipe( pipe_slow );
6060%}
6061
6062// --------------------------------- SUB --------------------------------------
6063
6064// Bytes vector sub
6065instruct vsubB(vec dst, vec src) %{
6066  predicate(UseAVX == 0);
6067  match(Set dst (SubVB dst src));
6068  format %{ "psubb   $dst,$src\t! sub packedB" %}
6069  ins_encode %{
6070    __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6071  %}
6072  ins_pipe( pipe_slow );
6073%}
6074
6075instruct vsubB_reg(vec dst, vec src1, vec src2) %{
6076  predicate(UseAVX > 0);
6077  match(Set dst (SubVB src1 src2));
6078  format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
6079  ins_encode %{
6080    int vector_len = vector_length_encoding(this);
6081    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6082  %}
6083  ins_pipe( pipe_slow );
6084%}
6085
6086instruct vsubB_mem(vec dst, vec src, memory mem) %{
6087  predicate(UseAVX > 0);
6088  match(Set dst (SubVB src (LoadVector mem)));
6089  format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
6090  ins_encode %{
6091    int vector_len = vector_length_encoding(this);
6092    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6093  %}
6094  ins_pipe( pipe_slow );
6095%}
6096
6097// Shorts/Chars vector sub
6098instruct vsubS(vec dst, vec src) %{
6099  predicate(UseAVX == 0);
6100  match(Set dst (SubVS dst src));
6101  format %{ "psubw   $dst,$src\t! sub packedS" %}
6102  ins_encode %{
6103    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6104  %}
6105  ins_pipe( pipe_slow );
6106%}
6107
6108
6109instruct vsubS_reg(vec dst, vec src1, vec src2) %{
6110  predicate(UseAVX > 0);
6111  match(Set dst (SubVS src1 src2));
6112  format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
6113  ins_encode %{
6114    int vector_len = vector_length_encoding(this);
6115    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6116  %}
6117  ins_pipe( pipe_slow );
6118%}
6119
6120instruct vsubS_mem(vec dst, vec src, memory mem) %{
6121  predicate(UseAVX > 0);
6122  match(Set dst (SubVS src (LoadVector mem)));
6123  format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
6124  ins_encode %{
6125    int vector_len = vector_length_encoding(this);
6126    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6127  %}
6128  ins_pipe( pipe_slow );
6129%}
6130
6131// Integers vector sub
6132instruct vsubI(vec dst, vec src) %{
6133  predicate(UseAVX == 0);
6134  match(Set dst (SubVI dst src));
6135  format %{ "psubd   $dst,$src\t! sub packedI" %}
6136  ins_encode %{
6137    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6138  %}
6139  ins_pipe( pipe_slow );
6140%}
6141
6142instruct vsubI_reg(vec dst, vec src1, vec src2) %{
6143  predicate(UseAVX > 0);
6144  match(Set dst (SubVI src1 src2));
6145  format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
6146  ins_encode %{
6147    int vector_len = vector_length_encoding(this);
6148    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6149  %}
6150  ins_pipe( pipe_slow );
6151%}
6152
6153instruct vsubI_mem(vec dst, vec src, memory mem) %{
6154  predicate(UseAVX > 0);
6155  match(Set dst (SubVI src (LoadVector mem)));
6156  format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
6157  ins_encode %{
6158    int vector_len = vector_length_encoding(this);
6159    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6160  %}
6161  ins_pipe( pipe_slow );
6162%}
6163
6164// Longs vector sub
6165instruct vsubL(vec dst, vec src) %{
6166  predicate(UseAVX == 0);
6167  match(Set dst (SubVL dst src));
6168  format %{ "psubq   $dst,$src\t! sub packedL" %}
6169  ins_encode %{
6170    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
6171  %}
6172  ins_pipe( pipe_slow );
6173%}
6174
6175instruct vsubL_reg(vec dst, vec src1, vec src2) %{
6176  predicate(UseAVX > 0);
6177  match(Set dst (SubVL src1 src2));
6178  format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
6179  ins_encode %{
6180    int vector_len = vector_length_encoding(this);
6181    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6182  %}
6183  ins_pipe( pipe_slow );
6184%}
6185
6186
6187instruct vsubL_mem(vec dst, vec src, memory mem) %{
6188  predicate(UseAVX > 0);
6189  match(Set dst (SubVL src (LoadVector mem)));
6190  format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
6191  ins_encode %{
6192    int vector_len = vector_length_encoding(this);
6193    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6194  %}
6195  ins_pipe( pipe_slow );
6196%}
6197
6198// Floats vector sub
6199instruct vsubF(vec dst, vec src) %{
6200  predicate(UseAVX == 0);
6201  match(Set dst (SubVF dst src));
6202  format %{ "subps   $dst,$src\t! sub packedF" %}
6203  ins_encode %{
6204    __ subps($dst$$XMMRegister, $src$$XMMRegister);
6205  %}
6206  ins_pipe( pipe_slow );
6207%}
6208
6209instruct vsubF_reg(vec dst, vec src1, vec src2) %{
6210  predicate(UseAVX > 0);
6211  match(Set dst (SubVF src1 src2));
6212  format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
6213  ins_encode %{
6214    int vector_len = vector_length_encoding(this);
6215    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6216  %}
6217  ins_pipe( pipe_slow );
6218%}
6219
6220instruct vsubF_mem(vec dst, vec src, memory mem) %{
6221  predicate(UseAVX > 0);
6222  match(Set dst (SubVF src (LoadVector mem)));
6223  format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
6224  ins_encode %{
6225    int vector_len = vector_length_encoding(this);
6226    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6227  %}
6228  ins_pipe( pipe_slow );
6229%}
6230
6231// Doubles vector sub
6232instruct vsubD(vec dst, vec src) %{
6233  predicate(UseAVX == 0);
6234  match(Set dst (SubVD dst src));
6235  format %{ "subpd   $dst,$src\t! sub packedD" %}
6236  ins_encode %{
6237    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
6238  %}
6239  ins_pipe( pipe_slow );
6240%}
6241
6242instruct vsubD_reg(vec dst, vec src1, vec src2) %{
6243  predicate(UseAVX > 0);
6244  match(Set dst (SubVD src1 src2));
6245  format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
6246  ins_encode %{
6247    int vector_len = vector_length_encoding(this);
6248    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6249  %}
6250  ins_pipe( pipe_slow );
6251%}
6252
6253instruct vsubD_mem(vec dst, vec src, memory mem) %{
6254  predicate(UseAVX > 0);
6255  match(Set dst (SubVD src (LoadVector mem)));
6256  format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
6257  ins_encode %{
6258    int vector_len = vector_length_encoding(this);
6259    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6260  %}
6261  ins_pipe( pipe_slow );
6262%}
6263
6264// --------------------------------- MUL --------------------------------------
6265
6266// Byte vector mul
6267instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
6268  predicate(n->as_Vector()->length() == 4 ||
6269            n->as_Vector()->length() == 8);
6270  match(Set dst (MulVB src1 src2));
6271  effect(TEMP dst, TEMP tmp, TEMP scratch);
6272  format %{"vector_mulB $dst,$src1,$src2" %}
6273  ins_encode %{
6274    assert(UseSSE > 3, "required");
6275    __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
6276    __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
6277    __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
6278    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6279    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6280    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6281  %}
6282  ins_pipe( pipe_slow );
6283%}
6284
6285instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
6286  predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
6287  match(Set dst (MulVB src1 src2));
6288  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6289  format %{"vector_mulB $dst,$src1,$src2" %}
6290  ins_encode %{
6291    assert(UseSSE > 3, "required");
6292    __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
6293    __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
6294    __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
6295    __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
6296    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
6297    __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
6298    __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
6299    __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
6300    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6301    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6302    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6303    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6304  %}
6305  ins_pipe( pipe_slow );
6306%}
6307
6308instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
6309  predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
6310  match(Set dst (MulVB src1 src2));
6311  effect(TEMP dst, TEMP tmp, TEMP scratch);
6312  format %{"vector_mulB $dst,$src1,$src2" %}
6313  ins_encode %{
6314  int vector_len = Assembler::AVX_256bit;
6315    __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
6316    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6317    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
6318    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6319    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
6320    __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
6321    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
6322  %}
6323  ins_pipe( pipe_slow );
6324%}
6325
6326instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
6327  predicate(n->as_Vector()->length() == 32);
6328  match(Set dst (MulVB src1 src2));
6329  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6330  format %{"vector_mulB $dst,$src1,$src2" %}
6331  ins_encode %{
6332    assert(UseAVX > 1, "required");
6333    int vector_len = Assembler::AVX_256bit;
6334    __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
6335    __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
6336    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
6337    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
6338    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
6339    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
6340    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6341    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6342    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6343    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
6344    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
6345    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6346    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
6347    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
6348  %}
6349  ins_pipe( pipe_slow );
6350%}
6351
6352instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
6353  predicate(n->as_Vector()->length() == 64);
6354  match(Set dst (MulVB src1 src2));
6355  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6356  format %{"vector_mulB $dst,$src1,$src2\n\t" %}
6357  ins_encode %{
6358    assert(UseAVX > 2, "required");
6359    int vector_len = Assembler::AVX_512bit;
6360    __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
6361    __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
6362    __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
6363    __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
6364    __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
6365    __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
6366    __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6367    __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6368    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6369    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
6370    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
6371    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6372    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6373    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
6374    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6375  %}
6376  ins_pipe( pipe_slow );
6377%}
6378
6379// Shorts/Chars vector mul
6380instruct vmulS(vec dst, vec src) %{
6381  predicate(UseAVX == 0);
6382  match(Set dst (MulVS dst src));
6383  format %{ "pmullw $dst,$src\t! mul packedS" %}
6384  ins_encode %{
6385    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6386  %}
6387  ins_pipe( pipe_slow );
6388%}
6389
6390instruct vmulS_reg(vec dst, vec src1, vec src2) %{
6391  predicate(UseAVX > 0);
6392  match(Set dst (MulVS src1 src2));
6393  format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
6394  ins_encode %{
6395    int vector_len = vector_length_encoding(this);
6396    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6397  %}
6398  ins_pipe( pipe_slow );
6399%}
6400
6401instruct vmulS_mem(vec dst, vec src, memory mem) %{
6402  predicate(UseAVX > 0);
6403  match(Set dst (MulVS src (LoadVector mem)));
6404  format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
6405  ins_encode %{
6406    int vector_len = vector_length_encoding(this);
6407    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6408  %}
6409  ins_pipe( pipe_slow );
6410%}
6411
6412// Integers vector mul
6413instruct vmulI(vec dst, vec src) %{
6414  predicate(UseAVX == 0);
6415  match(Set dst (MulVI dst src));
6416  format %{ "pmulld  $dst,$src\t! mul packedI" %}
6417  ins_encode %{
6418    assert(UseSSE > 3, "required");
6419    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
6420  %}
6421  ins_pipe( pipe_slow );
6422%}
6423
6424instruct vmulI_reg(vec dst, vec src1, vec src2) %{
6425  predicate(UseAVX > 0);
6426  match(Set dst (MulVI src1 src2));
6427  format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
6428  ins_encode %{
6429    int vector_len = vector_length_encoding(this);
6430    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6431  %}
6432  ins_pipe( pipe_slow );
6433%}
6434
6435instruct vmulI_mem(vec dst, vec src, memory mem) %{
6436  predicate(UseAVX > 0);
6437  match(Set dst (MulVI src (LoadVector mem)));
6438  format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
6439  ins_encode %{
6440    int vector_len = vector_length_encoding(this);
6441    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6442  %}
6443  ins_pipe( pipe_slow );
6444%}
6445
6446// Longs vector mul
6447instruct vmulL_reg(vec dst, vec src1, vec src2) %{
6448  match(Set dst (MulVL src1 src2));
6449  format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
6450  ins_encode %{
6451    assert(UseAVX > 2, "required");
6452    int vector_len = vector_length_encoding(this);
6453    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6454  %}
6455  ins_pipe( pipe_slow );
6456%}
6457
6458instruct vmulL_mem(vec dst, vec src, memory mem) %{
6459  match(Set dst (MulVL src (LoadVector mem)));
6460  format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
6461  ins_encode %{
6462    assert(UseAVX > 2, "required");
6463    int vector_len = vector_length_encoding(this);
6464    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6465  %}
6466  ins_pipe( pipe_slow );
6467%}
6468
6469// Floats vector mul
6470instruct vmulF(vec dst, vec src) %{
6471  predicate(UseAVX == 0);
6472  match(Set dst (MulVF dst src));
6473  format %{ "mulps   $dst,$src\t! mul packedF" %}
6474  ins_encode %{
6475    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
6476  %}
6477  ins_pipe( pipe_slow );
6478%}
6479
6480instruct vmulF_reg(vec dst, vec src1, vec src2) %{
6481  predicate(UseAVX > 0);
6482  match(Set dst (MulVF src1 src2));
6483  format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
6484  ins_encode %{
6485    int vector_len = vector_length_encoding(this);
6486    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6487  %}
6488  ins_pipe( pipe_slow );
6489%}
6490
6491instruct vmulF_mem(vec dst, vec src, memory mem) %{
6492  predicate(UseAVX > 0);
6493  match(Set dst (MulVF src (LoadVector mem)));
6494  format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
6495  ins_encode %{
6496    int vector_len = vector_length_encoding(this);
6497    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6498  %}
6499  ins_pipe( pipe_slow );
6500%}
6501
6502// Doubles vector mul
6503instruct vmulD(vec dst, vec src) %{
6504  predicate(UseAVX == 0);
6505  match(Set dst (MulVD dst src));
6506  format %{ "mulpd   $dst,$src\t! mul packedD" %}
6507  ins_encode %{
6508    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
6509  %}
6510  ins_pipe( pipe_slow );
6511%}
6512
6513instruct vmulD_reg(vec dst, vec src1, vec src2) %{
6514  predicate(UseAVX > 0);
6515  match(Set dst (MulVD src1 src2));
6516  format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
6517  ins_encode %{
6518    int vector_len = vector_length_encoding(this);
6519    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6520  %}
6521  ins_pipe( pipe_slow );
6522%}
6523
6524instruct vmulD_mem(vec dst, vec src, memory mem) %{
6525  predicate(UseAVX > 0);
6526  match(Set dst (MulVD src (LoadVector mem)));
6527  format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
6528  ins_encode %{
6529    int vector_len = vector_length_encoding(this);
6530    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6531  %}
6532  ins_pipe( pipe_slow );
6533%}
6534
6535instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
6536  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6537  match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
6538  effect(TEMP dst, USE src1, USE src2);
6539  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
6540            "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
6541         %}
6542  ins_encode %{
6543    int vector_len = 1;
6544    int cond = (Assembler::Condition)($copnd$$cmpcode);
6545    __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
6546    __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
6547  %}
6548  ins_pipe( pipe_slow );
6549%}
6550
6551instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
6552  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6553  match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
6554  effect(TEMP dst, USE src1, USE src2);
6555  format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
6556            "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
6557         %}
6558  ins_encode %{
6559    int vector_len = 1;
6560    int cond = (Assembler::Condition)($copnd$$cmpcode);
6561    __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
6562    __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
6563  %}
6564  ins_pipe( pipe_slow );
6565%}
6566
6567// --------------------------------- DIV --------------------------------------
6568
6569// Floats vector div
6570instruct vdivF(vec dst, vec src) %{
6571  predicate(UseAVX == 0);
6572  match(Set dst (DivVF dst src));
6573  format %{ "divps   $dst,$src\t! div packedF" %}
6574  ins_encode %{
6575    __ divps($dst$$XMMRegister, $src$$XMMRegister);
6576  %}
6577  ins_pipe( pipe_slow );
6578%}
6579
6580instruct vdivF_reg(vec dst, vec src1, vec src2) %{
6581  predicate(UseAVX > 0);
6582  match(Set dst (DivVF src1 src2));
6583  format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
6584  ins_encode %{
6585    int vector_len = vector_length_encoding(this);
6586    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6587  %}
6588  ins_pipe( pipe_slow );
6589%}
6590
6591instruct vdivF_mem(vec dst, vec src, memory mem) %{
6592  predicate(UseAVX > 0);
6593  match(Set dst (DivVF src (LoadVector mem)));
6594  format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
6595  ins_encode %{
6596    int vector_len = vector_length_encoding(this);
6597    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6598  %}
6599  ins_pipe( pipe_slow );
6600%}
6601
6602// Doubles vector div
6603instruct vdivD(vec dst, vec src) %{
6604  predicate(UseAVX == 0);
6605  match(Set dst (DivVD dst src));
6606  format %{ "divpd   $dst,$src\t! div packedD" %}
6607  ins_encode %{
6608    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
6609  %}
6610  ins_pipe( pipe_slow );
6611%}
6612
6613instruct vdivD_reg(vec dst, vec src1, vec src2) %{
6614  predicate(UseAVX > 0);
6615  match(Set dst (DivVD src1 src2));
6616  format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
6617  ins_encode %{
6618    int vector_len = vector_length_encoding(this);
6619    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6620  %}
6621  ins_pipe( pipe_slow );
6622%}
6623
6624instruct vdivD_mem(vec dst, vec src, memory mem) %{
6625  predicate(UseAVX > 0);
6626  match(Set dst (DivVD src (LoadVector mem)));
6627  format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
6628  ins_encode %{
6629    int vector_len = vector_length_encoding(this);
6630    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6631  %}
6632  ins_pipe( pipe_slow );
6633%}
6634
6635// --------------------------------- Sqrt --------------------------------------
6636
6637instruct vsqrtF_reg(vec dst, vec src) %{
6638  match(Set dst (SqrtVF src));
6639  format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6640  ins_encode %{
6641    assert(UseAVX > 0, "required");
6642    int vector_len = vector_length_encoding(this);
6643    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
6644  %}
6645  ins_pipe( pipe_slow );
6646%}
6647
6648instruct vsqrtF_mem(vec dst, memory mem) %{
6649  match(Set dst (SqrtVF (LoadVector mem)));
6650  format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6651  ins_encode %{
6652    assert(UseAVX > 0, "required");
6653    int vector_len = vector_length_encoding(this);
6654    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
6655  %}
6656  ins_pipe( pipe_slow );
6657%}
6658
6659// Floating point vector sqrt
6660instruct vsqrtD_reg(vec dst, vec src) %{
6661  match(Set dst (SqrtVD src));
6662  format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6663  ins_encode %{
6664    assert(UseAVX > 0, "required");
6665    int vector_len = vector_length_encoding(this);
6666    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
6667  %}
6668  ins_pipe( pipe_slow );
6669%}
6670
6671instruct vsqrtD_mem(vec dst, memory mem) %{
6672  match(Set dst (SqrtVD (LoadVector mem)));
6673  format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6674  ins_encode %{
6675    assert(UseAVX > 0, "required");
6676    int vector_len = vector_length_encoding(this);
6677    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
6678  %}
6679  ins_pipe( pipe_slow );
6680%}
6681
6682// ------------------------------ Shift ---------------------------------------
6683
6684// Left and right shift count vectors are the same on x86
6685// (only lowest bits of xmm reg are used for count).
6686instruct vshiftcnt(vec dst, rRegI cnt) %{
6687  match(Set dst (LShiftCntV cnt));
6688  match(Set dst (RShiftCntV cnt));
6689  format %{ "movdl    $dst,$cnt\t! load shift count" %}
6690  ins_encode %{
6691    __ movdl($dst$$XMMRegister, $cnt$$Register);
6692  %}
6693  ins_pipe( pipe_slow );
6694%}
6695
6696instruct vshiftcntimm(vec dst, immI8 cnt, rRegI tmp) %{
6697  match(Set dst cnt);
6698  effect(TEMP tmp);
6699  format %{ "movl    $tmp,$cnt\t"
6700            "movdl   $dst,$tmp\t! load shift count" %}
6701  ins_encode %{
6702    __ movl($tmp$$Register, $cnt$$constant);
6703    __ movdl($dst$$XMMRegister, $tmp$$Register);
6704  %}
6705  ins_pipe( pipe_slow );
6706%}
6707
6708// Byte vector shift
6709instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6710  predicate(n->as_Vector()->length() <= 8);
6711  match(Set dst (LShiftVB src shift));
6712  match(Set dst (RShiftVB src shift));
6713  match(Set dst (URShiftVB src shift));
6714  effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6715  format %{"vector_byte_shift $dst,$src,$shift" %}
6716  ins_encode %{
6717    assert(UseSSE > 3, "required");
6718    int opcode = this->ideal_Opcode();
6719    __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
6720    __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6721    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6722    __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6723    __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6724  %}
6725  ins_pipe( pipe_slow );
6726%}
6727
6728instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6729  predicate(n->as_Vector()->length() == 16 && UseAVX <= 1);
6730  match(Set dst (LShiftVB src shift));
6731  match(Set dst (RShiftVB src shift));
6732  match(Set dst (URShiftVB src shift));
6733  effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6734  format %{"vector_byte_shift $dst,$src,$shift" %}
6735  ins_encode %{
6736    assert(UseSSE > 3, "required");
6737    int opcode = this->ideal_Opcode();
6738
6739    __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
6740    __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6741    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6742    __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6743    __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6744    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6745    __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6746    __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6747    __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6748  %}
6749  ins_pipe( pipe_slow );
6750%}
6751
6752instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6753  predicate(n->as_Vector()->length() == 16 && UseAVX > 1);
6754  match(Set dst (LShiftVB src shift));
6755  match(Set dst (RShiftVB src shift));
6756  match(Set dst (URShiftVB src shift));
6757  effect(TEMP dst, TEMP tmp, TEMP scratch);
6758  format %{"vector_byte_shift $dst,$src,$shift" %}
6759  ins_encode %{
6760    int opcode = this->ideal_Opcode();
6761    int vector_len = Assembler::AVX_256bit;
6762    __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
6763    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
6764    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
6765    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6766    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6767  %}
6768  ins_pipe( pipe_slow );
6769%}
6770
6771instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6772  predicate(n->as_Vector()->length() == 32);
6773  match(Set dst (LShiftVB src shift));
6774  match(Set dst (RShiftVB src shift));
6775  match(Set dst (URShiftVB src shift));
6776  effect(TEMP dst, TEMP tmp, TEMP scratch);
6777  format %{"vector_byte_shift $dst,$src,$shift" %}
6778  ins_encode %{
6779    assert(UseAVX > 1, "required");
6780    int opcode = this->ideal_Opcode();
6781    int vector_len = Assembler::AVX_256bit;
6782    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6783    __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6784    __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
6785    __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
6786    __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
6787    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
6788    __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
6789    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
6790    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
6791  %}
6792  ins_pipe( pipe_slow );
6793%}
6794
6795instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6796  predicate(n->as_Vector()->length() == 64);
6797  match(Set dst (LShiftVB src shift));
6798  match(Set dst (RShiftVB src shift));
6799  match(Set dst (URShiftVB src shift));
6800  effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6801  format %{"vector_byte_shift $dst,$src,$shift" %}
6802  ins_encode %{
6803    assert(UseAVX > 2, "required");
6804    int opcode = this->ideal_Opcode();
6805    int vector_len = Assembler::AVX_512bit;
6806    __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6807    __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
6808    __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
6809    __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
6810    __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
6811    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6812    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
6813    __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
6814    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6815    __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6816    __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
6817    __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
6818  %}
6819  ins_pipe( pipe_slow );
6820%}
6821
6822// Shorts vector logical right shift produces incorrect Java result
6823// for negative data because java code convert short value into int with
6824// sign extension before a shift. But char vectors are fine since chars are
6825// unsigned values.
6826// Shorts/Chars vector left shift
6827instruct vshiftS(vec dst, vec src, vec shift) %{
6828  match(Set dst (LShiftVS src shift));
6829  match(Set dst (RShiftVS src shift));
6830  match(Set dst (URShiftVS src shift));
6831  effect(TEMP dst, USE src, USE shift);
6832  format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6833  ins_encode %{
6834    int opcode = this->ideal_Opcode();
6835    if (UseAVX > 0) {
6836      int vlen_enc = vector_length_encoding(this);
6837      __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6838    } else {
6839      int vlen = vector_length(this);
6840      if (vlen == 2) {
6841        __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6842        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6843      } else if (vlen == 4) {
6844        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6845        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6846      } else {
6847        assert (vlen == 8, "sanity");
6848        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6849        __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6850      }
6851    }
6852  %}
6853  ins_pipe( pipe_slow );
6854%}
6855
6856// Integers vector left shift
6857instruct vshiftI(vec dst, vec src, vec shift) %{
6858  match(Set dst (LShiftVI src shift));
6859  match(Set dst (RShiftVI src shift));
6860  match(Set dst (URShiftVI src shift));
6861  effect(TEMP dst, USE src, USE shift);
6862  format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6863  ins_encode %{
6864    int opcode = this->ideal_Opcode();
6865    if (UseAVX > 0) {
6866      int vector_len = vector_length_encoding(this);
6867      __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
6868    } else {
6869      int vlen = vector_length(this);
6870      if (vlen == 2) {
6871        __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6872        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6873      } else {
6874        assert(vlen == 4, "sanity");
6875        __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6876        __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6877      }
6878    }
6879  %}
6880  ins_pipe( pipe_slow );
6881%}
6882
6883// Longs vector shift
6884instruct vshiftL(vec dst, vec src, vec shift) %{
6885  match(Set dst (LShiftVL src shift));
6886  match(Set dst (URShiftVL src shift));
6887  effect(TEMP dst, USE src, USE shift);
6888  format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6889  ins_encode %{
6890    int opcode = this->ideal_Opcode();
6891    if (UseAVX > 0) {
6892      int vector_len = vector_length_encoding(this);
6893      __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
6894    } else {
6895      assert(vector_length(this) == 2, "");
6896      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6897      __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6898    }
6899  %}
6900  ins_pipe( pipe_slow );
6901%}
6902
6903// -------------------ArithmeticRightShift -----------------------------------
6904// Long vector arithmetic right shift
6905instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6906  predicate(UseAVX <= 2);
6907  match(Set dst (RShiftVL src shift));
6908  effect(TEMP dst, TEMP tmp, TEMP scratch);
6909  format %{ "vshiftq $dst,$src,$shift" %}
6910  ins_encode %{
6911    uint vlen = vector_length(this);
6912    if (vlen == 2) {
6913      assert(UseSSE >= 2, "required");
6914      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6915      __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6916      __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6917      __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6918      __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6919      __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6920    } else {
6921      assert(vlen == 4, "sanity");
6922      assert(UseAVX > 1, "required");
6923      int vector_len = Assembler::AVX_256bit;
6924      __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
6925      __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6926      __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
6927      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
6928      __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
6929    }
6930  %}
6931  ins_pipe( pipe_slow );
6932%}
6933
6934instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6935  predicate(UseAVX > 2);
6936  match(Set dst (RShiftVL src shift));
6937  format %{ "vshiftq $dst,$src,$shift" %}
6938  ins_encode %{
6939    int vector_len = vector_length_encoding(this);
6940    __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
6941  %}
6942  ins_pipe( pipe_slow );
6943%}
6944
6945// --------------------------------- AND --------------------------------------
6946
6947instruct vand(vec dst, vec src) %{
6948  predicate(UseAVX == 0);
6949  match(Set dst (AndV dst src));
6950  format %{ "pand    $dst,$src\t! and vectors" %}
6951  ins_encode %{
6952    __ pand($dst$$XMMRegister, $src$$XMMRegister);
6953  %}
6954  ins_pipe( pipe_slow );
6955%}
6956
6957instruct vand_reg(vec dst, vec src1, vec src2) %{
6958  predicate(UseAVX > 0);
6959  match(Set dst (AndV src1 src2));
6960  format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6961  ins_encode %{
6962    int vector_len = vector_length_encoding(this);
6963    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6964  %}
6965  ins_pipe( pipe_slow );
6966%}
6967
6968instruct vand_mem(vec dst, vec src, memory mem) %{
6969  predicate(UseAVX > 0);
6970  match(Set dst (AndV src (LoadVector mem)));
6971  format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6972  ins_encode %{
6973    int vector_len = vector_length_encoding(this);
6974    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6975  %}
6976  ins_pipe( pipe_slow );
6977%}
6978
6979// --------------------------------- OR ---------------------------------------
6980
6981instruct vor(vec dst, vec src) %{
6982  predicate(UseAVX == 0);
6983  match(Set dst (OrV dst src));
6984  format %{ "por     $dst,$src\t! or vectors" %}
6985  ins_encode %{
6986    __ por($dst$$XMMRegister, $src$$XMMRegister);
6987  %}
6988  ins_pipe( pipe_slow );
6989%}
6990
6991instruct vor_reg(vec dst, vec src1, vec src2) %{
6992  predicate(UseAVX > 0);
6993  match(Set dst (OrV src1 src2));
6994  format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6995  ins_encode %{
6996    int vector_len = vector_length_encoding(this);
6997    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6998  %}
6999  ins_pipe( pipe_slow );
7000%}
7001
7002instruct vor_mem(vec dst, vec src, memory mem) %{
7003  predicate(UseAVX > 0);
7004  match(Set dst (OrV src (LoadVector mem)));
7005  format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
7006  ins_encode %{
7007    int vector_len = vector_length_encoding(this);
7008    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7009  %}
7010  ins_pipe( pipe_slow );
7011%}
7012
7013// --------------------------------- XOR --------------------------------------
7014
7015instruct vxor(vec dst, vec src) %{
7016  predicate(UseAVX == 0);
7017  match(Set dst (XorV dst src));
7018  format %{ "pxor    $dst,$src\t! xor vectors" %}
7019  ins_encode %{
7020    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
7021  %}
7022  ins_pipe( pipe_slow );
7023%}
7024
7025instruct vxor_reg(vec dst, vec src1, vec src2) %{
7026  predicate(UseAVX > 0);
7027  match(Set dst (XorV src1 src2));
7028  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
7029  ins_encode %{
7030    int vector_len = vector_length_encoding(this);
7031    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7032  %}
7033  ins_pipe( pipe_slow );
7034%}
7035
7036instruct vxor_mem(vec dst, vec src, memory mem) %{
7037  predicate(UseAVX > 0);
7038  match(Set dst (XorV src (LoadVector mem)));
7039  format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
7040  ins_encode %{
7041    int vector_len = vector_length_encoding(this);
7042    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7043  %}
7044  ins_pipe( pipe_slow );
7045%}
7046
7047// --------------------------------- ABS --------------------------------------
7048// a = |a|
7049instruct vabsB_reg(vec dst, vec src) %{
7050  match(Set dst (AbsVB  src));
7051  format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7052  ins_encode %{
7053    uint vlen = vector_length(this);
7054    if (vlen <= 16) {
7055      __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7056    } else {
7057      int vlen_enc = vector_length_encoding(this);
7058      __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7059    }
7060  %}
7061  ins_pipe( pipe_slow );
7062%}
7063
7064instruct vabsS_reg(vec dst, vec src) %{
7065  match(Set dst (AbsVS  src));
7066  format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7067  ins_encode %{
7068    uint vlen = vector_length(this);
7069    if (vlen <= 8) {
7070      __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7071    } else {
7072      int vlen_enc = vector_length_encoding(this);
7073      __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7074    }
7075  %}
7076  ins_pipe( pipe_slow );
7077%}
7078
7079instruct vabsI_reg(vec dst, vec src) %{
7080  match(Set dst (AbsVI  src));
7081  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7082  ins_encode %{
7083    uint vlen = vector_length(this);
7084    if (vlen <= 4) {
7085      __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7086    } else {
7087      int vlen_enc = vector_length_encoding(this);
7088      __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7089    }
7090  %}
7091  ins_pipe( pipe_slow );
7092%}
7093
7094instruct vabsL_reg(vec dst, vec src) %{
7095  match(Set dst (AbsVL  src));
7096  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7097  ins_encode %{
7098    assert(UseAVX > 2, "required");
7099    int vector_len = vector_length_encoding(this);
7100    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7101  %}
7102  ins_pipe( pipe_slow );
7103%}
7104
7105// --------------------------------- ABSNEG --------------------------------------
7106
7107instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7108  predicate(n->as_Vector()->length() != 4); // handled by 1-operand instruction vabsneg4F
7109  match(Set dst (AbsVF src));
7110  match(Set dst (NegVF src));
7111  effect(TEMP scratch);
7112  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7113  ins_cost(150);
7114  ins_encode %{
7115    int opcode = this->ideal_Opcode();
7116    int vlen = vector_length(this);
7117    if (vlen == 2) {
7118      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7119    } else {
7120      assert(vlen == 8 || vlen == 16, "required");
7121      int vlen_enc = vector_length_encoding(this);
7122      __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7123    }
7124  %}
7125  ins_pipe( pipe_slow );
7126%}
7127
7128instruct vabsneg4F(vec dst, rRegI scratch) %{
7129  predicate(n->as_Vector()->length() == 4);
7130  match(Set dst (AbsVF dst));
7131  match(Set dst (NegVF dst));
7132  effect(TEMP scratch);
7133  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7134  ins_cost(150);
7135  ins_encode %{
7136    int opcode = this->ideal_Opcode();
7137    __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7138  %}
7139  ins_pipe( pipe_slow );
7140%}
7141
7142instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7143  match(Set dst (AbsVD  src));
7144  match(Set dst (NegVD  src));
7145  effect(TEMP scratch);
7146  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7147  ins_encode %{
7148    int opcode = this->ideal_Opcode();
7149    uint vlen = vector_length(this);
7150    if (vlen == 2) {
7151      assert(UseSSE >= 2, "required");
7152      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7153    } else {
7154      int vlen_enc = vector_length_encoding(this);
7155      __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7156    }
7157  %}
7158  ins_pipe( pipe_slow );
7159%}
7160
7161// --------------------------------- FMA --------------------------------------
7162// a * b + c
7163
7164instruct vfmaF_reg(vec a, vec b, vec c) %{
7165  match(Set c (FmaVF  c (Binary a b)));
7166  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
7167  ins_cost(150);
7168  ins_encode %{
7169    assert(UseFMA, "not enabled");
7170    int vector_len = vector_length_encoding(this);
7171    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
7172  %}
7173  ins_pipe( pipe_slow );
7174%}
7175
7176instruct vfmaF_mem(vec a, memory b, vec c) %{
7177  match(Set c (FmaVF  c (Binary a (LoadVector b))));
7178  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
7179  ins_cost(150);
7180  ins_encode %{
7181    assert(UseFMA, "not enabled");
7182    int vector_len = vector_length_encoding(this);
7183    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
7184  %}
7185  ins_pipe( pipe_slow );
7186%}
7187
7188instruct vfmaD_reg(vec a, vec b, vec c) %{
7189  match(Set c (FmaVD  c (Binary a b)));
7190  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
7191  ins_cost(150);
7192  ins_encode %{
7193    assert(UseFMA, "not enabled");
7194    int vector_len = vector_length_encoding(this);
7195    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
7196  %}
7197  ins_pipe( pipe_slow );
7198%}
7199
7200instruct vfmaD_mem(vec a, memory b, vec c) %{
7201  match(Set c (FmaVD  c (Binary a (LoadVector b))));
7202  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
7203  ins_cost(150);
7204  ins_encode %{
7205    assert(UseFMA, "not enabled");
7206    int vector_len = vector_length_encoding(this);
7207    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
7208  %}
7209  ins_pipe( pipe_slow );
7210%}
7211
7212// --------------------------------- Vector Multiply Add --------------------------------------
7213
7214instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
7215  predicate(UseAVX == 0);
7216  match(Set dst (MulAddVS2VI dst src1));
7217  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packedStoI" %}
7218  ins_encode %{
7219    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
7220  %}
7221  ins_pipe( pipe_slow );
7222%}
7223
7224instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
7225  predicate(UseAVX > 0);
7226  match(Set dst (MulAddVS2VI src1 src2));
7227  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
7228  ins_encode %{
7229    int vector_len = vector_length_encoding(this);
7230    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7231  %}
7232  ins_pipe( pipe_slow );
7233%}
7234
7235// --------------------------------- Vector Multiply Add Add ----------------------------------
7236
7237instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
7238  predicate(VM_Version::supports_vnni());
7239  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
7240  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
7241  ins_encode %{
7242    assert(UseAVX > 2, "required");
7243    int vector_len = vector_length_encoding(this);
7244    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7245  %}
7246  ins_pipe( pipe_slow );
7247  ins_cost(10);
7248%}
7249
7250// --------------------------------- PopCount --------------------------------------
7251
7252instruct vpopcountI(vec dst, vec src) %{
7253  match(Set dst (PopCountVI src));
7254  format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
7255  ins_encode %{
7256    assert(UsePopCountInstruction, "not enabled");
7257
7258    int vector_len = vector_length_encoding(this);
7259    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
7260  %}
7261  ins_pipe( pipe_slow );
7262%}
7263