1 /*
2  * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  */
23 
24 
25 package org.graalvm.compiler.asm.amd64;
26 
27 import static jdk.vm.ci.amd64.AMD64.rax;
28 import static jdk.vm.ci.amd64.AMD64.rcx;
29 import static jdk.vm.ci.amd64.AMD64.rdx;
30 import static jdk.vm.ci.amd64.AMD64.rsp;
31 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
32 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
33 import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;
34 
35 import org.graalvm.compiler.asm.Label;
36 import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
37 import org.graalvm.compiler.core.common.NumUtil;
38 
39 import jdk.vm.ci.amd64.AMD64;
40 import jdk.vm.ci.amd64.AMD64Kind;
41 import jdk.vm.ci.code.Register;
42 import jdk.vm.ci.code.TargetDescription;
43 
44 /**
45  * This class implements commonly used X86 code patterns.
46  */
47 public class AMD64MacroAssembler extends AMD64Assembler {
48 
    /**
     * Creates a macro assembler for the given target.
     *
     * @param target target description, forwarded unchanged to the {@link AMD64Assembler}
     *            constructor
     */
    public AMD64MacroAssembler(TargetDescription target) {
        super(target);
    }
52 
decrementq(Register reg, int value)53     public final void decrementq(Register reg, int value) {
54         if (value == Integer.MIN_VALUE) {
55             subq(reg, value);
56             return;
57         }
58         if (value < 0) {
59             incrementq(reg, -value);
60             return;
61         }
62         if (value == 0) {
63             return;
64         }
65         if (value == 1 && UseIncDec) {
66             decq(reg);
67         } else {
68             subq(reg, value);
69         }
70     }
71 
decrementq(AMD64Address dst, int value)72     public final void decrementq(AMD64Address dst, int value) {
73         if (value == Integer.MIN_VALUE) {
74             subq(dst, value);
75             return;
76         }
77         if (value < 0) {
78             incrementq(dst, -value);
79             return;
80         }
81         if (value == 0) {
82             return;
83         }
84         if (value == 1 && UseIncDec) {
85             decq(dst);
86         } else {
87             subq(dst, value);
88         }
89     }
90 
incrementq(Register reg, int value)91     public void incrementq(Register reg, int value) {
92         if (value == Integer.MIN_VALUE) {
93             addq(reg, value);
94             return;
95         }
96         if (value < 0) {
97             decrementq(reg, -value);
98             return;
99         }
100         if (value == 0) {
101             return;
102         }
103         if (value == 1 && UseIncDec) {
104             incq(reg);
105         } else {
106             addq(reg, value);
107         }
108     }
109 
incrementq(AMD64Address dst, int value)110     public final void incrementq(AMD64Address dst, int value) {
111         if (value == Integer.MIN_VALUE) {
112             addq(dst, value);
113             return;
114         }
115         if (value < 0) {
116             decrementq(dst, -value);
117             return;
118         }
119         if (value == 0) {
120             return;
121         }
122         if (value == 1 && UseIncDec) {
123             incq(dst);
124         } else {
125             addq(dst, value);
126         }
127     }
128 
    /**
     * Loads a pointer-sized value from memory into a register; emitted as {@code movq}.
     *
     * @param dst destination register
     * @param src memory operand to load from
     */
    public final void movptr(Register dst, AMD64Address src) {
        movq(dst, src);
    }
132 
    /**
     * Stores a pointer-sized register value to memory; emitted as {@code movq}.
     *
     * @param dst memory operand to store to
     * @param src source register
     */
    public final void movptr(AMD64Address dst, Register src) {
        movq(dst, src);
    }
136 
    /**
     * Stores an {@code int} constant to a pointer-sized memory location; emitted as
     * {@code movslq}.
     *
     * @param dst memory operand to store to
     * @param src constant to store; presumably sign-extended to 64 bits by {@code movslq} (TODO
     *            confirm against {@code AMD64Assembler.movslq})
     */
    public final void movptr(AMD64Address dst, int src) {
        movslq(dst, src);
    }
140 
    /**
     * Compares two pointer-sized register values; emitted as {@code cmpq}.
     *
     * @param src1 first operand
     * @param src2 second operand
     */
    public final void cmpptr(Register src1, Register src2) {
        cmpq(src1, src2);
    }
144 
    /**
     * Compares a pointer-sized register value with a memory operand; emitted as {@code cmpq}.
     *
     * @param src1 first operand (register)
     * @param src2 second operand (memory)
     */
    public final void cmpptr(Register src1, AMD64Address src2) {
        cmpq(src1, src2);
    }
148 
    /**
     * Decrements {@code reg} by one; shorthand for {@code decrementl(reg, 1)}.
     *
     * @param reg register to decrement
     */
    public final void decrementl(Register reg) {
        decrementl(reg, 1);
    }
152 
decrementl(Register reg, int value)153     public final void decrementl(Register reg, int value) {
154         if (value == Integer.MIN_VALUE) {
155             subl(reg, value);
156             return;
157         }
158         if (value < 0) {
159             incrementl(reg, -value);
160             return;
161         }
162         if (value == 0) {
163             return;
164         }
165         if (value == 1 && UseIncDec) {
166             decl(reg);
167         } else {
168             subl(reg, value);
169         }
170     }
171 
decrementl(AMD64Address dst, int value)172     public final void decrementl(AMD64Address dst, int value) {
173         if (value == Integer.MIN_VALUE) {
174             subl(dst, value);
175             return;
176         }
177         if (value < 0) {
178             incrementl(dst, -value);
179             return;
180         }
181         if (value == 0) {
182             return;
183         }
184         if (value == 1 && UseIncDec) {
185             decl(dst);
186         } else {
187             subl(dst, value);
188         }
189     }
190 
incrementl(Register reg, int value)191     public final void incrementl(Register reg, int value) {
192         if (value == Integer.MIN_VALUE) {
193             addl(reg, value);
194             return;
195         }
196         if (value < 0) {
197             decrementl(reg, -value);
198             return;
199         }
200         if (value == 0) {
201             return;
202         }
203         if (value == 1 && UseIncDec) {
204             incl(reg);
205         } else {
206             addl(reg, value);
207         }
208     }
209 
incrementl(AMD64Address dst, int value)210     public final void incrementl(AMD64Address dst, int value) {
211         if (value == Integer.MIN_VALUE) {
212             addl(dst, value);
213             return;
214         }
215         if (value < 0) {
216             decrementl(dst, -value);
217             return;
218         }
219         if (value == 0) {
220             return;
221         }
222         if (value == 1 && UseIncDec) {
223             incl(dst);
224         } else {
225             addl(dst, value);
226         }
227     }
228 
movflt(Register dst, Register src)229     public void movflt(Register dst, Register src) {
230         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
231         if (UseXmmRegToRegMoveAll) {
232             movaps(dst, src);
233         } else {
234             movss(dst, src);
235         }
236     }
237 
    /**
     * Loads a single-precision value from memory into an XMM register; emitted as
     * {@code movss}.
     *
     * @param dst destination register; must be an XMM register
     * @param src memory operand to load from
     */
    public void movflt(Register dst, AMD64Address src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }
242 
    /**
     * Stores a single-precision value from an XMM register to memory; emitted as
     * {@code movss}.
     *
     * @param dst memory operand to store to
     * @param src source register; must be an XMM register
     */
    public void movflt(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }
247 
movdbl(Register dst, Register src)248     public void movdbl(Register dst, Register src) {
249         assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
250         if (UseXmmRegToRegMoveAll) {
251             movapd(dst, src);
252         } else {
253             movsd(dst, src);
254         }
255     }
256 
movdbl(Register dst, AMD64Address src)257     public void movdbl(Register dst, AMD64Address src) {
258         assert dst.getRegisterCategory().equals(AMD64.XMM);
259         if (UseXmmLoadAndClearUpper) {
260             movsd(dst, src);
261         } else {
262             movlpd(dst, src);
263         }
264     }
265 
    /**
     * Stores a double-precision value from an XMM register to memory; emitted as
     * {@code movsd}.
     *
     * @param dst memory operand to store to
     * @param src source register; must be an XMM register
     */
    public void movdbl(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movsd(dst, src);
    }
270 
271     /**
272      * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
273      * volatile field!
274      */
movlong(AMD64Address dst, long src)275     public final void movlong(AMD64Address dst, long src) {
276         if (NumUtil.isInt(src)) {
277             AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
278         } else {
279             AMD64Address high = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
280             movl(dst, (int) (src & 0xFFFFFFFF));
281             movl(high, (int) (src >> 32));
282         }
283 
284     }
285 
    /**
     * Materializes a condition in a register: emits {@code setb} for {@code cc} on
     * {@code dst}, then {@code movzbl} to zero-extend the resulting byte within {@code dst}.
     *
     * @param cc condition to materialize
     * @param dst register receiving the result
     */
    public final void setl(ConditionFlag cc, Register dst) {
        setb(cc, dst);
        movzbl(dst, dst);
    }
290 
    /**
     * Materializes a condition in a register: emits {@code setb} for {@code cc} on
     * {@code dst}, then {@code movzbq} to zero-extend the resulting byte within {@code dst}.
     *
     * @param cc condition to materialize
     * @param dst register receiving the result
     */
    public final void setq(ConditionFlag cc, Register dst) {
        setb(cc, dst);
        movzbq(dst, dst);
    }
295 
flog(Register dest, Register value, boolean base10)296     public final void flog(Register dest, Register value, boolean base10) {
297         if (base10) {
298             fldlg2();
299         } else {
300             fldln2();
301         }
302         AMD64Address tmp = trigPrologue(value);
303         fyl2x();
304         trigEpilogue(dest, tmp);
305     }
306 
fsin(Register dest, Register value)307     public final void fsin(Register dest, Register value) {
308         AMD64Address tmp = trigPrologue(value);
309         fsin();
310         trigEpilogue(dest, tmp);
311     }
312 
fcos(Register dest, Register value)313     public final void fcos(Register dest, Register value) {
314         AMD64Address tmp = trigPrologue(value);
315         fcos();
316         trigEpilogue(dest, tmp);
317     }
318 
ftan(Register dest, Register value)319     public final void ftan(Register dest, Register value) {
320         AMD64Address tmp = trigPrologue(value);
321         fptan();
322         fstp(0); // ftan pushes 1.0 in addition to the actual result, pop
323         trigEpilogue(dest, tmp);
324     }
325 
    /**
     * Pops the x87 register stack without storing the value: emits {@code ffree(0)} followed
     * by {@code fincstp}.
     */
    public final void fpop() {
        ffree(0);
        fincstp();
    }
330 
trigPrologue(Register value)331     private AMD64Address trigPrologue(Register value) {
332         assert value.getRegisterCategory().equals(AMD64.XMM);
333         AMD64Address tmp = new AMD64Address(AMD64.rsp);
334         subq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
335         movdbl(tmp, value);
336         fldd(tmp);
337         return tmp;
338     }
339 
trigEpilogue(Register dest, AMD64Address tmp)340     private void trigEpilogue(Register dest, AMD64Address tmp) {
341         assert dest.getRegisterCategory().equals(AMD64.XMM);
342         fstpd(tmp);
343         movdbl(dest, tmp);
344         addq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
345     }
346 
    /**
     * Emits an {@code indexOf} search for a constant substring of at least 8 chars, which does
     * not need to be loaded through the stack.
     * <p>
     * The emitted code is built around the {@code pcmpestri} instruction, which uses bound
     * registers (see the comment at the top of the method body). On exit {@code result} holds
     * the element index of the match relative to {@code str1}, or -1 if there is no match.
     *
     * @param str1 address of the string to scan
     * @param str2 address of the substring
     * @param cnt1 string length in elements; must be {@code rdx}
     * @param cnt2 scratch, overwritten with the substring length; must be {@code rax}
     * @param intCnt2 constant substring length in elements; must be at least 8
     * @param result register receiving the result
     * @param vec vector register the substring is loaded into
     * @param tmp receives the index computed by {@code pcmpestri}; must be {@code rcx}
     */
    public void stringIndexofC8(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp) {
        // SSE4.2 is required (original HotSpot check: assert(UseSSE42Intrinsics, ...)).

        // This method uses the pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label exit = new Label();
        Label foundSubstr = new Label();
        Label matchSubstrHead = new Label();
        Label reloadStr = new Label();
        Label foundCandidate = new Label();

        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";

        // Load substring.
        movdqu(vec, new AMD64Address(str2, 0));
        movl(cnt2, intCnt2);
        movq(result, str1); // string addr

        if (intCnt2 > 8) {
            jmpb(scanToSubstr);

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            negq(cnt2); // Jumped here with negative cnt2, convert to positive

            bind(reloadStr);
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.

            // cnt2 is the number of remaining substring elements and
            // cnt1 is the number of remaining string elements when cmp failed.
            // Restored cnt1 = cnt1 - cnt2 + int_cnt2
            subl(cnt1, cnt2);
            addl(cnt1, intCnt2);
            movl(cnt2, intCnt2); // Now restore cnt2

            decrementl(cnt1, 1);     // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2);

        } // (int_cnt2 > 8)

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // Matched whole vector if first element matched (tmp(rcx) == 0).
        if (intCnt2 == 8) {
            jccb(ConditionFlag.Overflow, retFound);    // OF == 1
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.Overflow, foundSubstr);
        }
        // After pcmpestri tmp(rcx) contains matched element index
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        if (intCnt2 == 8) {
            jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
        }
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(exit);

        if (intCnt2 > 8) {
            // This code is optimized for the case when whole substring
            // is matched if its head is matched.
            bind(matchSubstrHead);
            pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
            // Reload only string if does not match
            jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0

            Label contScanSubstr = new Label();
            // Compare the rest of substring (> 8 chars).
            bind(foundSubstr);
            // First 8 chars are already matched.
            negq(cnt2);
            addq(cnt2, 8);

            bind(scanSubstr);
            subl(cnt1, 8);
            cmpl(cnt2, -8); // Do not read beyond substring
            jccb(ConditionFlag.LessEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring:
            // cnt1 = cnt1 - cnt2 + 8
            addl(cnt1, cnt2); // cnt2 is negative
            addl(cnt1, 8);
            movl(cnt2, 8);
            negq(cnt2);
            bind(contScanSubstr);
            if (intCnt2 < 1024 * 1024 * 1024) {
                // intCnt2 * 2 fits in the int displacement
                movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
                pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
            } else {
                // calculate index in register to avoid integer overflow (int_cnt2*2)
                movl(tmp, intCnt2);
                addq(tmp, cnt2);
                movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
                pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
            }
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
            addq(cnt2, 8);
            jcc(ConditionFlag.Negative, scanSubstr);
            // Fall through if found full substring

        } // (int_cnt2 > 8)

        bind(retFound);
        // Found result if we matched full small substring.
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // byte offset -> element index
        bind(exit);

    } // string_indexofC8
505 
    /**
     * Emits an {@code indexOf} search for a small constant substring (fewer than 8 chars) or
     * for a substring whose length is only known at run time.
     * <p>
     * Small strings are copied to the stack if loading 16 bytes from them would cross a page
     * boundary. The original stack pointer is saved in {@code tmp}, pushed before the search
     * starts, and restored by the final {@code pop(rsp)}. On exit {@code result} holds the
     * element index of the match, or -1 if there is no match.
     * <p>
     * The emitted code is built around the {@code pcmpestri} instruction, which uses bound
     * registers (see the comment in the method body).
     *
     * @param str1 address of the string to scan; may be overwritten
     * @param str2 address of the substring; may be overwritten
     * @param cnt1 string length in elements; must be {@code rdx}
     * @param cnt2 substring length in elements for a non-constant substring, overwritten with
     *            {@code intCnt2} otherwise; must be {@code rax}
     * @param intCnt2 length of the constant substring (1 to 7), or -1 for a non-constant
     *            substring whose length is in {@code cnt2}
     * @param result register receiving the result; also used as scratch
     * @param vec vector register the substring is loaded into
     * @param tmp scratch register, also receives the index computed by {@code pcmpestri}; must
     *            be {@code rcx}
     * @param vmPageSize page size used for the page-crossing checks; must be below 1 GB on the
     *            non-constant path
     */
    public void stringIndexOf(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp, int vmPageSize) {
        //
        // int_cnt2 is length of small (< 8 chars) constant substring
        // or (-1) for non constant substring in which case its length
        // is in cnt2 register.
        //
        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        //
        assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";

        // This method uses pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label adjustStr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label cleanup = new Label();
        Label foundSubstr = new Label();
        Label foundCandidate = new Label();

        int wordSize = 8;
        // We don't know where these strings are located
        // and we can't read beyond them. Load them through stack.
        Label bigStrings = new Label();
        Label checkStr = new Label();
        Label copySubstr = new Label();
        Label copyStr = new Label();

        movq(tmp, rsp); // save old SP

        if (intCnt2 > 0) {     // small (< 8 chars) constant substring
            if (intCnt2 == 1) {  // One char
                movzwl(result, new AMD64Address(str2, 0));
                movdl(vec, result); // move 32 bits
            } else if (intCnt2 == 2) { // Two chars
                movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
            } else if (intCnt2 == 4) { // Four chars
                movq(vec, new AMD64Address(str2, 0));  // move 64 bits
            } else { // cnt2 = { 3, 5, 6, 7 }
                // Array header size is 12 bytes in 32-bit VM
                // + 6 bytes for 3 chars == 18 bytes,
                // enough space to load vec and shift.
                // NOTE(review): this load starts before str2 (negative displacement) and so
                // relies on the bytes preceding the substring being readable - confirm for
                // every caller.
                movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
                psrldq(vec, 16 - (intCnt2 * 2));
            }
        } else { // not constant substring
            cmpl(cnt2, 8);
            jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough

            // We can read beyond string if str+16 does not cross page boundary
            // since heaps are aligned and mapped by pages.
            assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
            movl(result, str2); // We need only low 32 bits
            andl(result, (vmPageSize - 1));
            cmpl(result, (vmPageSize - 16));
            jccb(ConditionFlag.BelowEqual, checkStr);

            // Move small strings to stack to allow load 16 bytes into vec.
            subq(rsp, 16);
            // The pushed cnt2 sits below the 16-byte buffer, hence the wordSize bias;
            // the -2 compensates for cnt2 being a 1-based element count.
            int stackOffset = wordSize - 2;
            push(cnt2);

            bind(copySubstr);
            movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
            movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
            decrementl(cnt2, 1);
            jccb(ConditionFlag.NotZero, copySubstr);

            pop(cnt2);
            movq(str2, rsp);  // New substring address
        } // non constant

        bind(checkStr);
        cmpl(cnt1, 8);
        jccb(ConditionFlag.AboveEqual, bigStrings);

        // Check cross page boundary.
        movl(result, str1); // We need only low 32 bits
        andl(result, (vmPageSize - 1));
        cmpl(result, (vmPageSize - 16));
        jccb(ConditionFlag.BelowEqual, bigStrings);

        subq(rsp, 16);
        int stackOffset = -2;
        if (intCnt2 < 0) { // not constant
            push(cnt2);
            stackOffset += wordSize;
        }
        // cnt2 is used as the copy-loop counter below
        movl(cnt2, cnt1);

        bind(copyStr);
        movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
        movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
        decrementl(cnt2, 1);
        jccb(ConditionFlag.NotZero, copyStr);

        if (intCnt2 < 0) { // not constant
            pop(cnt2);
        }
        movq(str1, rsp);  // New string address

        bind(bigStrings);
        // Load substring.
        if (intCnt2 < 0) { // -1
            movdqu(vec, new AMD64Address(str2, 0));
            push(cnt2);       // substr count
            push(str2);       // substr addr
            push(str1);       // string addr
        } else {
            // Small (< 8 chars) constant substrings are loaded already.
            movl(cnt2, intCnt2);
        }
        push(tmp);  // original SP
        // Finished loading

        // ========================================================
        // Start search
        //

        movq(result, str1); // string addr

        if (intCnt2 < 0) {  // Only for non constant substring
            jmpb(scanToSubstr);

            // SP saved at sp+0
            // String saved at sp+1*wordSize
            // Substr saved at sp+2*wordSize
            // Substr count saved at sp+3*wordSize

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movq(str2, new AMD64Address(rsp, 2 * wordSize));
            movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
            movdqu(vec, new AMD64Address(str2, 0));
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.
            subq(str1, result); // Restore counter
            shrl(str1, 1);
            addl(cnt1, str1);
            decrementl(cnt1);   // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2);
        } // non constant

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);

        bind(adjustStr);
        cmpl(cnt1, 8); // Do not read beyond string
        jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        // Back-up string to avoid reading beyond string.
        leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
        movl(cnt1, 8);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // After pcmpestri tmp(rcx) contains matched element index

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.GreaterEqual, foundSubstr);
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(cleanup);

        bind(foundSubstr);
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2));

        if (intCnt2 > 0) { // Constant substring
            // Repeat search for small substring (< 8 chars)
            // from new point without reloading substring.
            // Have to check that we don't read beyond string.
            cmpl(tmp, 8 - intCnt2);
            jccb(ConditionFlag.Greater, adjustStr);
            // Fall through if matched whole substring.
        } else { // non constant
            assert intCnt2 == -1 : "should be != 0";

            addl(tmp, cnt2);
            // Found result if we matched whole substring.
            cmpl(tmp, 8);
            jccb(ConditionFlag.LessEqual, retFound);

            // Repeat search for small substring (<= 8 chars)
            // from new point 'str1' without reloading substring.
            cmpl(cnt2, 8);
            // Have to check that we don't read beyond string.
            jccb(ConditionFlag.LessEqual, adjustStr);

            Label checkNext = new Label();
            Label contScanSubstr = new Label();
            Label retFoundLong = new Label();
            // Compare the rest of substring (> 8 chars).
            movq(str1, result);

            cmpl(tmp, cnt2);
            // First 8 chars are already matched.
            jccb(ConditionFlag.Equal, checkNext);

            bind(scanSubstr);
            pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0

            bind(checkNext);
            subl(cnt2, 8);
            jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
            addq(str1, 16);
            addq(str2, 16);
            subl(cnt1, 8);
            cmpl(cnt2, 8); // Do not read beyond substring
            jccb(ConditionFlag.GreaterEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring.
            leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
            leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
            subl(cnt1, cnt2);
            movl(cnt2, 8);
            addl(cnt1, 8);
            bind(contScanSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            jmpb(scanSubstr);

            bind(retFoundLong);
            // Reload the string address saved on the stack (sp+1*wordSize)
            movq(str1, new AMD64Address(rsp, wordSize));
        } // non constant

        bind(retFound);
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // byte offset -> element index

        bind(cleanup);
        pop(rsp); // restore SP

    }
775 
776 }
777