1 /*
2  * Copyright (c) 2018, 2019 Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  */
23 
24 
25 package org.graalvm.compiler.lir.amd64;
26 
27 import jdk.vm.ci.amd64.AMD64;
28 import jdk.vm.ci.amd64.AMD64.CPUFeature;
29 import jdk.vm.ci.amd64.AMD64Kind;
30 import jdk.vm.ci.code.Register;
31 import jdk.vm.ci.meta.JavaKind;
32 import jdk.vm.ci.meta.Value;
33 import org.graalvm.compiler.asm.Label;
34 import org.graalvm.compiler.asm.amd64.AMD64Address;
35 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
36 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
37 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
38 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
39 import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
40 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
41 import org.graalvm.compiler.asm.amd64.AVXKind;
42 import org.graalvm.compiler.core.common.LIRKind;
43 import org.graalvm.compiler.lir.LIRInstructionClass;
44 import org.graalvm.compiler.lir.Opcode;
45 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
46 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
47 
48 import static jdk.vm.ci.code.ValueUtil.asRegister;
49 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
50 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
51 
/**
 * LIR instruction that searches a byte or char array for up to four search values using
 * SSE/AVX packed comparisons. In "find two consecutive" mode it searches for a two-element
 * sequence instead, with both elements packed into a single wider search value. The result
 * is the element index of the first match, or -1 if there is no match.
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    // element kind of the searched array; asserted to be Byte or Char in the constructor
    private final JavaKind kind;
    // operating system page size; used to decide whether a trailing out-of-bounds vector read
    // could cross into an unmapped page
    private final int vmPageSize;
    // number of distinct search values (1 to 4)
    private final int nValues;
    // if true, search for two consecutive elements packed into searchValue1 (nValues == 1)
    private final boolean findTwoConsecutive;
    // vector register kind used for the comparisons (128-bit, or 256-bit with AVX2)
    private final AMD64Kind vectorKind;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    // @Use (not @Alive): emitCode copies the length into arraySlotsRemaining first, after which
    // the length register may be reused
    @Use({REG}) protected Value arrayLengthValue;
    @Alive({REG}) protected Value searchValue1;
    @Alive({REG, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, ILLEGAL}) protected Value searchValue4;
    // scratch: number of array elements not yet searched
    @Temp({REG}) protected Value arraySlotsRemaining;
    // scratch: bit masks produced by PMOVMSKB from the packed comparison results
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    // vectors filled with broadcast copies of the search values
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    // vectors holding array contents loaded per iteration
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;
84 
    /**
     * Creates the intrinsic array-index-of operation.
     *
     * @param kind element kind of the array; must be {@code Byte} or {@code Char}.
     * @param findTwoConsecutive if {@code true}, search for two consecutive elements packed into
     *            the single (wider) search value; requires exactly one search value.
     * @param vmPageSize operating system page size, used by the emitted code to check whether a
     *            trailing out-of-bounds vector read would cross a page boundary.
     * @param maxVectorSize maximum vector size in bytes; negative values impose no limit.
     * @param tool used to allocate the temporary virtual registers.
     * @param result receives the element index of the first match, or -1.
     * @param arrayPtr pointer to the first element to search.
     * @param arrayLength number of elements to search.
     * @param searchValues one to four values to search for.
     */
    public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
                    Value... searchValues) {
        super(TYPE);
        this.kind = kind;
        this.findTwoConsecutive = findTwoConsecutive;
        this.vmPageSize = vmPageSize;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(kind) || charMode(kind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        // unused search-value slots are marked ILLEGAL so the register allocator ignores them
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        // prefer 32-byte (YMM) vectors when AVX2 is available and the size limit permits it;
        // otherwise fall back to 16-byte (XMM) vectors
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }
119 
byteMode(JavaKind kind)120     private static boolean byteMode(JavaKind kind) {
121         return kind == JavaKind.Byte;
122     }
123 
charMode(JavaKind kind)124     private static boolean charMode(JavaKind kind) {
125         return kind == JavaKind.Char;
126     }
127 
getComparisonKind()128     private JavaKind getComparisonKind() {
129         return findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind;
130     }
131 
getVectorSize()132     private AVXKind.AVXSize getVectorSize() {
133         return AVXKind.getDataSize(vectorKind);
134     }
135 
    /**
     * Emits register setup (length copy, pointer copy, search-value broadcast), the vectorized
     * search loops, and the epilogue that converts the matching pointer back to an element index
     * (or -1 if no match was found).
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register result = asRegister(resultValue);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register[] searchValue = {
                        nValues > 0 ? asRegister(searchValue1) : null,
                        nValues > 1 ? asRegister(searchValue2) : null,
                        nValues > 2 ? asRegister(searchValue3) : null,
                        nValues > 3 ? asRegister(searchValue4) : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        asRegister(comparisonResult2),
                        asRegister(comparisonResult3),
                        asRegister(comparisonResult4),
        };
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        // load array length
        // important: this must be the first register manipulation, since arrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // load array pointer; the result register doubles as the running array pointer until the
        // epilogue converts it into an index
        asm.movq(result, arrayPtr);
        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            if (asm.supports(CPUFeature.AVX)) {
                VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
            } else {
                asm.movdl(vecCmp[i], searchValue[i]);
            }
        }
        // fill comparison vector with copies of the search value
        // (vecArray[0] is free at this point and serves as the broadcast scratch register)
        for (int i = 0; i < nValues; i++) {
            emitBroadcast(asm, getComparisonKind(), vecCmp[i], vecArray[0], getVectorSize());
        }

        emitArrayIndexOfChars(crb, asm, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movq(result, -1);
        asm.jmpb(end);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        if (charMode(kind)) {
            // convert byte offset to a char index
            asm.shrq(result, 1);
        }
        asm.bind(end);
    }
204 
    /**
     * Emits the actual search code: one unaligned pass to reach bulk alignment, a bulk loop that
     * compares several vectors per iteration, a single-vector loop for the remainder, and a
     * scalar tail loop used only when an over-reading vector load would cross a memory page
     * boundary. On a match, jumps to {@code retFound} with {@code arrayPtr} advanced to the
     * matching element; otherwise jumps to {@code retNotFound}.
     */
    private void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm,
                    Register arrayPtr,
                    Register slotsRemaining,
                    Register[] searchValue,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label retFound,
                    Label retNotFound) {
        // trade parallel vectors per iteration for additional search values
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        AVXKind.AVXSize vectorSize = getVectorSize();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int singleVectorLoopCondition = arraySlotsPerVector;
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        int bulkLoopCondition = bulkSize;
        int[] vectorOffsets;
        JavaKind vectorCompareKind = kind;
        if (findTwoConsecutive) {
            // a pair may straddle the vector boundary, so one extra slot must be readable
            singleVectorLoopCondition++;
            bulkLoopCondition++;
            bulkSize /= 2;
            bulkSizeBytes /= 2;
            // pairs are covered by comparing at offset 0 and again shifted by one element
            vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
            // compare two consecutive elements at once as a single value of twice the width
            vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
        } else {
            vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
        }

        // load copy of low part of array pointer
        // (cmpResult[0] is free until the first vector compare and serves as scratch)
        Register tmpArrayPtrLow = cmpResult[0];
        asm.movl(tmpArrayPtrLow, arrayPtr);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        // check if array pointer is aligned to bulkSize
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to bulk size alignment
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
        // adjust array pointer to bulk size alignment
        asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
        // adjust number of array slots remaining
        // (the slots between the aligned pointer and the old pointer will be re-examined, but
        // they were already covered by the unaligned pass above)
        asm.addl(slotsRemaining, tmpArrayPtrLow);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        asm.bind(bulkVectorLoop);
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoConsecutive);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
        // continue loop
        asm.jmp(bulkVectorLoop);

        if (nVectors > 1) {
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // check if single vector load is in bounds
            asm.cmpl(slotsRemaining, singleVectorLoopCondition);
            asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
            // compare
            emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, arraySlotsPerVector);
            // adjust array pointer
            asm.addq(arrayPtr, bytesPerVector);
            // continue loop
            asm.jmpb(singleVectorLoop);
        }

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // check if pointer + vector size would cross the page boundary
        asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
        asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoConsecutive ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);

        Label[] overBoundsMatch = {new Label(), new Label()};
        // otherwise, do a vector compare that reads beyond array bounds
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
        // no match
        asm.jmp(retNotFound);
        if (findTwoConsecutive) {
            Label overBoundsFinish = new Label();
            asm.bind(overBoundsMatch[1]);
            // get match offset of second result
            asm.bsfq(cmpResult[1], cmpResult[1]);
            asm.addl(cmpResult[1], kind.getByteCount());
            // replace first result with second and continue
            asm.movl(cmpResult[0], cmpResult[1]);
            asm.jmpb(overBoundsFinish);

            asm.bind(overBoundsMatch[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
        } else {
            asm.bind(overBoundsMatch[0]);
            // find match offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }

        // adjust array pointer for match result
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // check if offset of matched value is greater than number of bytes remaining / out of array
        // bounds
        if (findTwoConsecutive) {
            asm.decrementl(slotsRemaining);
        }
        asm.cmpl(cmpResult[0], slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // adjust number of array slots remaining
        if (findTwoConsecutive) {
            asm.incrementl(slotsRemaining, 1);
        }
        asm.subl(slotsRemaining, cmpResult[0]);
        // match is in bounds, return offset
        asm.jmp(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if enough array slots remain
        asm.cmpl(slotsRemaining, findTwoConsecutive ? 1 : 0);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
        // load char / byte
        if (byteMode(kind)) {
            if (findTwoConsecutive) {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        } else {
            if (findTwoConsecutive) {
                asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        }
        // check for match
        for (int i = 0; i < nValues; i++) {
            emitCompareInst(asm, getComparisonKind(), cmpResult[0], searchValue[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        }
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining);
        // adjust array pointer
        asm.addq(arrayPtr, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

        // match handlers for vectors after the first: apply the vector's byte offset, then
        // compute the final pointer and jump to retFound
        for (int i = 1; i < nVectors; i += (findTwoConsecutive ? 2 : 1)) {
            emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
        }

        if (findTwoConsecutive) {
            asm.bind(vectorFound[2]);
            asm.addq(arrayPtr, vectorOffsets[2]);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
            // move the second vector pair's masks into the first pair's registers and reuse the
            // generic pair handler below
            asm.movl(cmpResult[0], cmpResult[2]);
            asm.movl(cmpResult[1], cmpResult[3]);
            asm.bind(vectorFound[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
        } else {
            asm.bind(vectorFound[0]);
            // find index of first set bit in bit mask
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }
        // add offset to array pointer
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult[0]);
        asm.jmpb(retFound);
    }
428 
    /**
     * In find-two-consecutive mode, {@code cmpResult[0]} holds the match mask of the unshifted
     * compare and {@code cmpResult[1]} the mask of the compare shifted by one element. Computes
     * the smaller of the two match byte offsets and leaves it in {@code cmpResult[0]}, binding
     * {@code done} at the end. Both mask registers are clobbered.
     */
    private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
        // find match offset
        asm.bsfq(cmpResult[0], cmpResult[0]);
        // check if second result is also a match
        asm.testl(cmpResult[1], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
        // get match offset of second result
        asm.bsfq(cmpResult[1], cmpResult[1]);
        // account for the one-element shift of the second compare
        asm.addl(cmpResult[1], kind.getByteCount());
        // check if first result is less than second
        asm.cmpl(cmpResult[0], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
        // first result is greater than second, replace it with the second result
        asm.movl(cmpResult[0], cmpResult[1]);
        asm.bind(done);
    }
445 
emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm)446     private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
447         asm.align(crb.target.wordSize * 2);
448     }
449 
    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword, choosing the cheapest
     * instruction sequence the CPU supports. {@code vecTmp} is used (and clobbered) only on the
     * AVX-without-AVX2 and SSSE3 byte paths.
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    // single-instruction byte broadcast
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    // shuffle with an all-zero control vector replicates byte 0 into every lane
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
                    // widen the low byte to a full dword by interleaving, then splat the dword
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    // splat the low word within the low qword, then splat that dword everywhere
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }
495 
496     /**
497      * Convert a byte offset stored in {@code bytes} to an array index offset.
498      */
emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes)499     private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
500         if (charMode(kind)) {
501             asm.shrl(bytes, 1);
502         } else {
503             assert byteMode(kind);
504         }
505     }
506 
    /**
     * Loads the array contents at the given byte offsets into {@code vecArray}, compares every
     * loaded vector against each search-value vector in {@code vecCmp}, condenses the packed
     * comparison results into bit masks in {@code cmpResult} (one mask per vector, OR-combined
     * across search values), and branches to the corresponding {@code vectorFound} label when a
     * mask is non-zero. Falls through when nothing matched.
     */
    private static void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    AVXKind.AVXSize vectorSize,
                    int nValues,
                    int nVectors,
                    int[] vectorOffsets,
                    Register arrayPtr,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean alignedLoad) {
        // load array contents into vectors
        // (one copy of each offset per search value, laid out as vecArray[value * nVectors + vector])
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
            }
        }
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        for (int i = 0; i < nValues * nVectors; i++) {
            emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
        }
        // join results of comparisons against multiple values
        // (pairwise OR-tree so each of the first nVectors masks covers all search values)
        for (int stride = 1; stride < nValues; stride *= 2) {
            for (int i = 0; i < nVectors; i++) {
                for (int j = 0; j + stride < nValues; j += stride * 2) {
                    asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
                }
            }
        }
        // check if a match was found
        for (int i = 0; i < nVectors; i++) {
            asm.testl(cmpResult[i], cmpResult[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
        }
    }
551 
    /**
     * Binds {@code entry} and emits the epilogue for a match found in a vector loaded at
     * {@code resultOffset} bytes past the array pointer: advances {@code result} by the vector
     * offset plus the bit position of the first set mask bit, adjusts {@code slotsRemaining}
     * accordingly, and jumps to {@code ret}. {@code cmpResult} is clobbered.
     */
    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int resultOffset,
                    Register result,
                    Register cmpResult,
                    Register slotsRemaining,
                    Label entry,
                    Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult, 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult);
        asm.jmpb(ret);
    }
579 
emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad)580     private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
581         AMD64Address src = new AMD64Address(arrayPtr, offset);
582         if (asm.supports(CPUFeature.AVX)) {
583             VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
584             loadOp.emit(asm, vectorSize, vecDst, src);
585         } else {
586             // SSE
587             asm.movdqu(vecDst, src);
588         }
589     }
590 
591     /**
592      * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching values
593      * are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
594      */
emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp)595     private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
596         switch (kind) {
597             case Byte:
598                 if (asm.supports(CPUFeature.AVX)) {
599                     VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
600                 } else { // SSE
601                     asm.pcmpeqb(vecArray, vecCmp);
602                 }
603                 break;
604             case Short:
605             case Char:
606                 if (asm.supports(CPUFeature.AVX)) {
607                     VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
608                 } else { // SSE
609                     asm.pcmpeqw(vecArray, vecCmp);
610                 }
611                 break;
612             case Int:
613                 if (asm.supports(CPUFeature.AVX)) {
614                     VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
615                 } else { // SSE
616                     asm.pcmpeqd(vecArray, vecCmp);
617                 }
618                 break;
619             default:
620                 throw new UnsupportedOperationException();
621         }
622     }
623 
emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc)624     private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
625         if (asm.supports(CPUFeature.AVX)) {
626             VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
627         } else {
628             // SSE
629             asm.pmovmskb(dst, vecSrc);
630         }
631     }
632 
emitCompareInst(AMD64MacroAssembler asm, JavaKind kind, Register dst, Register src)633     private static void emitCompareInst(AMD64MacroAssembler asm, JavaKind kind, Register dst, Register src) {
634         switch (kind) {
635             case Byte:
636                 asm.cmpb(dst, src);
637                 break;
638             case Short:
639             case Char:
640                 asm.cmpw(dst, src);
641                 break;
642             case Int:
643                 asm.cmpl(dst, src);
644                 break;
645             default:
646                 asm.cmpq(dst, src);
647         }
648     }
649 
supportsAVX2(LIRGeneratorTool tool)650     private static boolean supportsAVX2(LIRGeneratorTool tool) {
651         return supports(tool, CPUFeature.AVX2);
652     }
653 
supports(LIRGeneratorTool tool, CPUFeature cpuFeature)654     private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
655         return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
656     }
657 }
658