/*
 * Copyright (c) 2018, 2019 Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

/**
 */
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);

    private final JavaKind kind;
    private final int vmPageSize;
    private final int nValues;
    private final boolean findTwoConsecutive;
    private final AMD64Kind vectorKind;

    @Def({REG}) protected Value resultValue;
    @Alive({REG}) protected Value arrayPtrValue;
    @Use({REG}) protected Value arrayLengthValue;
    @Alive({REG}) protected Value searchValue1;
    @Alive({REG, ILLEGAL}) protected Value searchValue2;
    @Alive({REG, ILLEGAL}) protected Value searchValue3;
    @Alive({REG, ILLEGAL}) protected Value searchValue4;
    @Temp({REG}) protected Value arraySlotsRemaining;
    @Temp({REG}) protected Value comparisonResult1;
    @Temp({REG}) protected Value comparisonResult2;
    @Temp({REG}) protected Value comparisonResult3;
    @Temp({REG}) protected Value comparisonResult4;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
    @Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
    @Temp({REG, ILLEGAL}) protected Value vectorArray1;
    @Temp({REG, ILLEGAL}) protected Value vectorArray2;
    @Temp({REG, ILLEGAL}) protected Value vectorArray3;
    @Temp({REG, ILLEGAL}) protected Value vectorArray4;

    public AMD64ArrayIndexOfOp(JavaKind kind, boolean findTwoConsecutive, int vmPageSize, int maxVectorSize, LIRGeneratorTool tool, Value result, Value arrayPtr, Value arrayLength,
                    Value... searchValues) {
        super(TYPE);
        this.kind = kind;
        this.findTwoConsecutive = findTwoConsecutive;
        this.vmPageSize = vmPageSize;
        assert 0 < searchValues.length && searchValues.length <= 4;
        assert byteMode(kind) || charMode(kind);
        assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
        nValues = searchValues.length;
        assert !findTwoConsecutive || nValues == 1;
        resultValue = result;
        arrayPtrValue = arrayPtr;
        arrayLengthValue = arrayLength;
        searchValue1 = searchValues[0];
        searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
        searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
        searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
        arraySlotsRemaining = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult1 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult2 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult3 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        comparisonResult4 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
        vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(kind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
                        : byteMode(kind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
        vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
        vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
        vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
    }

    private static boolean byteMode(JavaKind kind) {
        return kind == JavaKind.Byte;
    }

    private static boolean charMode(JavaKind kind) {
        return kind == JavaKind.Char;
    }

    private JavaKind getComparisonKind() {
        return findTwoConsecutive ? (byteMode(kind) ? JavaKind.Char : JavaKind.Int) : kind;
    }

    private AVXKind.AVXSize getVectorSize() {
        return AVXKind.getDataSize(vectorKind);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register arrayPtr = asRegister(arrayPtrValue);
        Register arrayLength = asRegister(arrayLengthValue);
        Register result = asRegister(resultValue);
        Register slotsRemaining = asRegister(arraySlotsRemaining);
        Register[] searchValue = {
                        nValues > 0 ? asRegister(searchValue1) : null,
                        nValues > 1 ? asRegister(searchValue2) : null,
                        nValues > 2 ? asRegister(searchValue3) : null,
                        nValues > 3 ? asRegister(searchValue4) : null,
        };
        Register[] vecCmp = {
                        nValues > 0 ? asRegister(vectorCompareVal1) : null,
                        nValues > 1 ? asRegister(vectorCompareVal2) : null,
                        nValues > 2 ? asRegister(vectorCompareVal3) : null,
                        nValues > 3 ? asRegister(vectorCompareVal4) : null,
        };
        Register[] vecArray = {
                        asRegister(vectorArray1),
                        asRegister(vectorArray2),
                        asRegister(vectorArray3),
                        asRegister(vectorArray4),
        };
        Register[] cmpResult = {
                        asRegister(comparisonResult1),
                        asRegister(comparisonResult2),
                        asRegister(comparisonResult3),
                        asRegister(comparisonResult4),
        };
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label end = new Label();

        // load array length
        // important: this must be the first register manipulation, since arrayLengthValue is
        // annotated with @Use
        asm.movl(slotsRemaining, arrayLength);
        // load array pointer
        asm.movq(result, arrayPtr);
        // move search values to vectors
        for (int i = 0; i < nValues; i++) {
            if (asm.supports(CPUFeature.AVX)) {
                VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, vecCmp[i], searchValue[i]);
            } else {
                asm.movdl(vecCmp[i], searchValue[i]);
            }
        }
        // fill comparison vector with copies of the search value
        for (int i = 0; i < nValues; i++) {
            emitBroadcast(asm, getComparisonKind(), vecCmp[i], vecArray[0], getVectorSize());
        }

        emitArrayIndexOfChars(crb, asm, result, slotsRemaining, searchValue, vecCmp, vecArray, cmpResult, retFound, retNotFound);

        // return -1 (no match)
        asm.bind(retNotFound);
        asm.movq(result, -1);
        asm.jmpb(end);

        asm.bind(retFound);
        // convert array pointer to offset
        asm.subq(result, arrayPtr);
        if (charMode(kind)) {
            asm.shrq(result, 1);
        }
        asm.bind(end);
    }
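
    /**
     * Emits the search loops: a bulk loop that compares several vectors per iteration (one
     * unaligned pass is used to align {@code arrayPtr} to the bulk size first), a single-vector
     * loop, and a slot-by-slot tail loop. When the remaining tail can be read with one vector
     * load without crossing a page boundary, that (out-of-bounds) read is used instead of the
     * tail loop. On a match, {@code arrayPtr} is advanced to the matching slot and control jumps
     * to {@code retFound}; otherwise it jumps to {@code retNotFound}.
     */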
    private void emitArrayIndexOfChars(CompilationResultBuilder crb, AMD64MacroAssembler asm,
                    Register arrayPtr,
                    Register slotsRemaining,
                    Register[] searchValue,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label retFound,
                    Label retNotFound) {
        int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
        AVXKind.AVXSize vectorSize = getVectorSize();

        Label bulkVectorLoop = new Label();
        Label singleVectorLoop = new Label();
        Label[] vectorFound = {
                        new Label(),
                        new Label(),
                        new Label(),
                        new Label(),
        };
        Label lessThanVectorSizeRemaining = new Label();
        Label lessThanVectorSizeRemainingLoop = new Label();
        Label bulkVectorLoopExit = nVectors == 1 ? lessThanVectorSizeRemaining : singleVectorLoop;
        int bytesPerVector = vectorSize.getBytes();
        int arraySlotsPerVector = vectorSize.getBytes() / kind.getByteCount();
        int singleVectorLoopCondition = arraySlotsPerVector;
        int bulkSize = arraySlotsPerVector * nVectors;
        int bulkSizeBytes = bytesPerVector * nVectors;
        int bulkLoopCondition = bulkSize;
        int[] vectorOffsets;
        JavaKind vectorCompareKind = kind;
        if (findTwoConsecutive) {
            singleVectorLoopCondition++;
            bulkLoopCondition++;
            bulkSize /= 2;
            bulkSizeBytes /= 2;
            vectorOffsets = new int[]{0, kind.getByteCount(), bytesPerVector, bytesPerVector + kind.getByteCount()};
            vectorCompareKind = byteMode(kind) ? JavaKind.Char : JavaKind.Int;
        } else {
            vectorOffsets = new int[]{0, bytesPerVector, bytesPerVector * 2, bytesPerVector * 3};
        }

        // load copy of low part of array pointer
        Register tmpArrayPtrLow = cmpResult[0];
        asm.movl(tmpArrayPtrLow, arrayPtr);

        // check if bulk vector load is in bounds
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        // check if array pointer is aligned to bulkSize
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, bulkVectorLoop);

        // do one unaligned bulk comparison pass and adjust alignment afterwards
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // get offset to bulk size alignment
        asm.andl(tmpArrayPtrLow, bulkSizeBytes - 1);
        emitBytesToArraySlots(asm, kind, tmpArrayPtrLow);
        // adjust array pointer to bulk size alignment
        asm.andq(arrayPtr, ~(bulkSizeBytes - 1));
        // adjust number of array slots remaining
        asm.addl(slotsRemaining, tmpArrayPtrLow);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);

        emitAlign(crb, asm);
        asm.bind(bulkVectorLoop);
        // memory-aligned bulk comparison
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, nVectors, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, !findTwoConsecutive);
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, bulkSize);
        // adjust array pointer
        asm.addq(arrayPtr, bulkSizeBytes);
        // check if there are enough array slots remaining for the bulk loop
        asm.cmpl(slotsRemaining, bulkLoopCondition);
        asm.jcc(AMD64Assembler.ConditionFlag.Below, bulkVectorLoopExit);
        // continue loop
        asm.jmp(bulkVectorLoop);

        if (nVectors > 1) {
            emitAlign(crb, asm);
            // same loop as bulkVectorLoop, with only one vector
            asm.bind(singleVectorLoop);
            // check if single vector load is in bounds
            asm.cmpl(slotsRemaining, singleVectorLoopCondition);
            asm.jcc(AMD64Assembler.ConditionFlag.Below, lessThanVectorSizeRemaining);
            // compare
            emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, vectorFound, false);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, arraySlotsPerVector);
            // adjust array pointer
            asm.addq(arrayPtr, bytesPerVector);
            // continue loop
            asm.jmpb(singleVectorLoop);
        }

        asm.bind(lessThanVectorSizeRemaining);
        // check if any array slots remain
        asm.testl(slotsRemaining, slotsRemaining);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, retNotFound);

        // a vector compare will read out of bounds of the input array.
        // check if the out-of-bounds read would cross a memory page boundary.
        // load copy of low part of array pointer
        asm.movl(tmpArrayPtrLow, arrayPtr);
        // check if pointer + vector size would cross the page boundary
        asm.andl(tmpArrayPtrLow, (vmPageSize - 1));
        asm.cmpl(tmpArrayPtrLow, (vmPageSize - (findTwoConsecutive ? bytesPerVector + kind.getByteCount() : bytesPerVector)));
        // if the page boundary would be crossed, do byte/character-wise comparison instead.
        asm.jccb(AMD64Assembler.ConditionFlag.Above, lessThanVectorSizeRemainingLoop);

        Label[] overBoundsMatch = {new Label(), new Label()};
        // otherwise, do a vector compare that reads beyond array bounds
        emitVectorCompare(asm, vectorCompareKind, vectorSize, nValues, findTwoConsecutive ? 2 : 1, vectorOffsets, arrayPtr, vecCmp, vecArray, cmpResult, overBoundsMatch, false);
        // no match
        asm.jmp(retNotFound);
        if (findTwoConsecutive) {
            Label overBoundsFinish = new Label();
            asm.bind(overBoundsMatch[1]);
            // get match offset of second result
            asm.bsfq(cmpResult[1], cmpResult[1]);
            asm.addl(cmpResult[1], kind.getByteCount());
            // replace first result with second and continue
            asm.movl(cmpResult[0], cmpResult[1]);
            asm.jmpb(overBoundsFinish);

            asm.bind(overBoundsMatch[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, overBoundsFinish);
        } else {
            asm.bind(overBoundsMatch[0]);
            // find match offset
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }

        // adjust array pointer for match result
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // check if offset of matched value is greater than number of bytes remaining / out of array
        // bounds
        if (findTwoConsecutive) {
            asm.decrementl(slotsRemaining);
        }
        asm.cmpl(cmpResult[0], slotsRemaining);
        // match is out of bounds, return no match
        asm.jcc(AMD64Assembler.ConditionFlag.GreaterEqual, retNotFound);
        // adjust number of array slots remaining
        if (findTwoConsecutive) {
            asm.incrementl(slotsRemaining, 1);
        }
        asm.subl(slotsRemaining, cmpResult[0]);
        // match is in bounds, return offset
        asm.jmp(retFound);

        // compare remaining slots in the array one-by-one
        asm.bind(lessThanVectorSizeRemainingLoop);
        // check if enough array slots remain
        asm.cmpl(slotsRemaining, findTwoConsecutive ? 1 : 0);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, retNotFound);
        // load char / byte
        if (byteMode(kind)) {
            if (findTwoConsecutive) {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzbl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        } else {
            if (findTwoConsecutive) {
                asm.movl(cmpResult[0], new AMD64Address(arrayPtr));
            } else {
                asm.movzwl(cmpResult[0], new AMD64Address(arrayPtr));
            }
        }
        // check for match
        for (int i = 0; i < nValues; i++) {
            emitCompareInst(asm, getComparisonKind(), cmpResult[0], searchValue[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.Equal, retFound);
        }
        // adjust number of array slots remaining
        asm.decrementl(slotsRemaining);
        // adjust array pointer
        asm.addq(arrayPtr, kind.getByteCount());
        // continue loop
        asm.jmpb(lessThanVectorSizeRemainingLoop);

        for (int i = 1; i < nVectors; i += (findTwoConsecutive ? 2 : 1)) {
            emitVectorFoundWithOffset(asm, kind, vectorOffsets[i], arrayPtr, cmpResult[i], slotsRemaining, vectorFound[i], retFound);
        }

        if (findTwoConsecutive) {
            asm.bind(vectorFound[2]);
            asm.addq(arrayPtr, vectorOffsets[2]);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? vectorOffsets[2] / 2 : vectorOffsets[2]);
            asm.movl(cmpResult[0], cmpResult[2]);
            asm.movl(cmpResult[1], cmpResult[3]);
            asm.bind(vectorFound[0]);
            emitFindTwoCharPrefixMinResult(asm, kind, cmpResult, new Label());
        } else {
            asm.bind(vectorFound[0]);
            // find index of first set bit in bit mask
            asm.bsfq(cmpResult[0], cmpResult[0]);
        }
        // add offset to array pointer
        asm.addq(arrayPtr, cmpResult[0]);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult[0], 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult[0]);
        asm.jmpb(retFound);
    }
    private static void emitFindTwoCharPrefixMinResult(AMD64MacroAssembler asm, JavaKind kind, Register[] cmpResult, Label done) {
        // find match offset
        asm.bsfq(cmpResult[0], cmpResult[0]);
        // check if second result is also a match
        asm.testl(cmpResult[1], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.Zero, done);
        // get match offset of second result
        asm.bsfq(cmpResult[1], cmpResult[1]);
        asm.addl(cmpResult[1], kind.getByteCount());
        // check if first result is less than second
        asm.cmpl(cmpResult[0], cmpResult[1]);
        asm.jcc(AMD64Assembler.ConditionFlag.LessEqual, done);
        // first result is greater than second, replace it with the second result
        asm.movl(cmpResult[0], cmpResult[1]);
        asm.bind(done);
    }

    private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        asm.align(crb.target.wordSize * 2);
    }

    /**
     * Fills {@code vecDst} with copies of its lowest byte, word or dword.
     */
    private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
                    VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
                } else if (asm.supports(CPUFeature.SSSE3)) {
                    asm.pxor(vecTmp, vecTmp);
                    asm.pshufb(vecDst, vecTmp);
                } else { // SSE2
                    asm.punpcklbw(vecDst, vecDst);
                    asm.punpcklbw(vecDst, vecDst);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshuflw(vecDst, vecDst, 0);
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX2)) {
                    VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
                } else if (asm.supports(CPUFeature.AVX)) {
                    VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
                } else { // SSE
                    asm.pshufd(vecDst, vecDst, 0);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    /**
     * Convert a byte offset stored in {@code bytes} to an array index offset.
     */
    private static void emitBytesToArraySlots(AMD64MacroAssembler asm, JavaKind kind, Register bytes) {
        if (charMode(kind)) {
            asm.shrl(bytes, 1);
        } else {
            assert byteMode(kind);
        }
    }
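
    /**
     * Loads the array contents at each of the {@code nVectors} offsets in {@code vectorOffsets}
     * (one copy per search value), compares them against the broadcast search values in
     * {@code vecCmp}, extracts a bit mask from every comparison result with (V)PMOVMSKB, ORs the
     * masks of all search values belonging to the same offset together, and jumps to
     * {@code vectorFound[i]} if the chunk at offset index {@code i} contains a match. The joined
     * mask for offset index {@code i} is left in {@code cmpResult[i]}.
     */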
    private static void emitVectorCompare(AMD64MacroAssembler asm,
                    JavaKind kind,
                    AVXKind.AVXSize vectorSize,
                    int nValues,
                    int nVectors,
                    int[] vectorOffsets,
                    Register arrayPtr,
                    Register[] vecCmp,
                    Register[] vecArray,
                    Register[] cmpResult,
                    Label[] vectorFound,
                    boolean alignedLoad) {
        // load array contents into vectors
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitArrayLoad(asm, vectorSize, vecArray[(i * nVectors) + j], arrayPtr, vectorOffsets[j], alignedLoad);
            }
        }
        // compare all loaded bytes to the search value.
        // matching bytes are set to 0xff, non-matching bytes are set to 0x00.
        for (int i = 0; i < nValues; i++) {
            for (int j = 0; j < nVectors; j++) {
                emitVectorCompareInst(asm, kind, vectorSize, vecArray[(i * nVectors) + j], vecCmp[i]);
            }
        }
        // create 32-bit-masks from the most significant bit of every byte in the comparison
        // results.
        for (int i = 0; i < nValues * nVectors; i++) {
            emitMOVMSK(asm, vectorSize, cmpResult[i], vecArray[i]);
        }
        // join results of comparisons against multiple values
        for (int stride = 1; stride < nValues; stride *= 2) {
            for (int i = 0; i < nVectors; i++) {
                for (int j = 0; j + stride < nValues; j += stride * 2) {
                    asm.orl(cmpResult[i + (j * nVectors)], cmpResult[i + ((j + stride) * nVectors)]);
                }
            }
        }
        // check if a match was found
        for (int i = 0; i < nVectors; i++) {
            asm.testl(cmpResult[i], cmpResult[i]);
            asm.jcc(AMD64Assembler.ConditionFlag.NotZero, vectorFound[i]);
        }
    }
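
    /**
     * Match handler bound at {@code entry}: advances {@code result} by {@code resultOffset} plus
     * the index of the first set bit in {@code cmpResult}, adjusts {@code slotsRemaining}
     * accordingly, and jumps to {@code ret}.
     */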
    private static void emitVectorFoundWithOffset(AMD64MacroAssembler asm,
                    JavaKind kind,
                    int resultOffset,
                    Register result,
                    Register cmpResult,
                    Register slotsRemaining,
                    Label entry,
                    Label ret) {
        asm.bind(entry);
        if (resultOffset > 0) {
            // adjust array pointer
            asm.addq(result, resultOffset);
            // adjust number of array slots remaining
            asm.subl(slotsRemaining, charMode(kind) ? resultOffset / 2 : resultOffset);
        }
        // find index of first set bit in bit mask
        asm.bsfq(cmpResult, cmpResult);
        // add offset to array pointer
        asm.addq(result, cmpResult);
        if (charMode(kind)) {
            // convert byte offset to chars
            asm.shrl(cmpResult, 1);
        }
        // adjust number of array slots remaining
        asm.subl(slotsRemaining, cmpResult);
        asm.jmpb(ret);
    }

    private static void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, int offset, boolean alignedLoad) {
        AMD64Address src = new AMD64Address(arrayPtr, offset);
        if (asm.supports(CPUFeature.AVX)) {
            VexMoveOp loadOp = alignedLoad ? VexMoveOp.VMOVDQA : VexMoveOp.VMOVDQU;
            loadOp.emit(asm, vectorSize, vecDst, src);
        } else {
            // SSE
            asm.movdqu(vecDst, src);
        }
    }

    /**
     * Compares all packed bytes/words/dwords in {@code vecArray} to {@code vecCmp}. Matching
     * values are set to all ones (0xff, 0xffff, ...), non-matching values are set to zero.
     */
    private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
        switch (kind) {
            case Byte:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQB.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqb(vecArray, vecCmp);
                }
                break;
            case Short:
            case Char:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQW.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqw(vecArray, vecCmp);
                }
                break;
            case Int:
                if (asm.supports(CPUFeature.AVX)) {
                    VexRVMOp.VPCMPEQD.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
                } else { // SSE
                    asm.pcmpeqd(vecArray, vecCmp);
                }
                break;
            default:
                throw new UnsupportedOperationException();
        }
    }

    private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
        if (asm.supports(CPUFeature.AVX)) {
            VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
        } else {
            // SSE
            asm.pmovmskb(dst, vecSrc);
        }
    }

    private static void emitCompareInst(AMD64MacroAssembler asm, JavaKind kind, Register dst, Register src) {
        switch (kind) {
            case Byte:
                asm.cmpb(dst, src);
                break;
            case Short:
            case Char:
                asm.cmpw(dst, src);
                break;
            case Int:
                asm.cmpl(dst, src);
                break;
            default:
                asm.cmpq(dst, src);
        }
    }

    private static boolean supportsAVX2(LIRGeneratorTool tool) {
        return supports(tool, CPUFeature.AVX2);
    }

    private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
        return ((AMD64) tool.target().arch).getFeatures().contains(cpuFeature);
    }
}