1 /*
2  * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.
8  *
9  * This code is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12  * version 2 for more details (a copy is included in the LICENSE file that
13  * accompanied this code).
14  *
15  * You should have received a copy of the GNU General Public License version
16  * 2 along with this work; if not, write to the Free Software Foundation,
17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  *
19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20  * or visit www.oracle.com if you need additional information or have any
21  * questions.
22  *
23  */
24 
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/intrinsicnode.hpp"
30 
31 #ifdef PRODUCT
32 #define BLOCK_COMMENT(str) /* nothing */
33 #define STOP(error) stop(error)
34 #else
35 #define BLOCK_COMMENT(str) block_comment(str)
36 #define STOP(error) block_comment(error); stop(error)
37 #endif
38 
39 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
40 
41 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
42 
43 // Search for str1 in str2 and return index or -1
string_indexof(Register str2,Register str1,Register cnt2,Register cnt1,Register tmp1,Register tmp2,Register tmp3,Register tmp4,Register tmp5,Register tmp6,int icnt1,Register result,int ae)44 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
45                                        Register cnt2, Register cnt1,
46                                        Register tmp1, Register tmp2,
47                                        Register tmp3, Register tmp4,
48                                        Register tmp5, Register tmp6,
49                                        int icnt1, Register result, int ae) {
50   // NOTE: tmp5, tmp6 can be zr depending on specific method version
51   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
52 
53   Register ch1 = rscratch1;
54   Register ch2 = rscratch2;
55   Register cnt1tmp = tmp1;
56   Register cnt2tmp = tmp2;
57   Register cnt1_neg = cnt1;
58   Register cnt2_neg = cnt2;
59   Register result_tmp = tmp4;
60 
61   bool isL = ae == StrIntrinsicNode::LL;
62 
63   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
64   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
65   int str1_chr_shift = str1_isL ? 0:1;
66   int str2_chr_shift = str2_isL ? 0:1;
67   int str1_chr_size = str1_isL ? 1:2;
68   int str2_chr_size = str2_isL ? 1:2;
69   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
70                                       (chr_insn)&MacroAssembler::ldrh;
71   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
72                                       (chr_insn)&MacroAssembler::ldrh;
73   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
74   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
75 
76   // Note, inline_string_indexOf() generates checks:
77   // if (substr.count > string.count) return -1;
78   // if (substr.count == 0) return 0;
79 
80   // We have two strings, a source string in str2, cnt2 and a pattern string
81   // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
82 
83   // For larger pattern and source we use a simplified Boyer Moore algorithm.
84   // With a small pattern and source we use linear scan.
85 
86   if (icnt1 == -1) {
87     sub(result_tmp, cnt2, cnt1);
88     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
89     br(LT, LINEARSEARCH);
90     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
91     subs(zr, cnt1, 256);
92     lsr(tmp1, cnt2, 2);
93     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
94     br(GE, LINEARSTUB);
95   }
96 
97 // The Boyer Moore alogorithm is based on the description here:-
98 //
99 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
100 //
101 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
102 // and the 'Good Suffix' rule.
103 //
104 // These rules are essentially heuristics for how far we can shift the
105 // pattern along the search string.
106 //
107 // The implementation here uses the 'Bad Character' rule only because of the
108 // complexity of initialisation for the 'Good Suffix' rule.
109 //
110 // This is also known as the Boyer-Moore-Horspool algorithm:-
111 //
112 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
113 //
114 // This particular implementation has few java-specific optimizations.
115 //
116 // #define ASIZE 256
117 //
118 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
119 //       int i, j;
120 //       unsigned c;
121 //       unsigned char bc[ASIZE];
122 //
123 //       /* Preprocessing */
124 //       for (i = 0; i < ASIZE; ++i)
125 //          bc[i] = m;
126 //       for (i = 0; i < m - 1; ) {
127 //          c = x[i];
128 //          ++i;
129 //          // c < 256 for Latin1 string, so, no need for branch
130 //          #ifdef PATTERN_STRING_IS_LATIN1
131 //          bc[c] = m - i;
132 //          #else
133 //          if (c < ASIZE) bc[c] = m - i;
134 //          #endif
135 //       }
136 //
137 //       /* Searching */
138 //       j = 0;
139 //       while (j <= n - m) {
140 //          c = y[i+j];
141 //          if (x[m-1] == c)
142 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
143 //          if (i < 0) return j;
144 //          // c < 256 for Latin1 string, so, no need for branch
145 //          #ifdef SOURCE_STRING_IS_LATIN1
146 //          // LL case: (c< 256) always true. Remove branch
147 //          j += bc[y[j+m-1]];
148 //          #endif
149 //          #ifndef PATTERN_STRING_IS_UTF
150 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
151 //          if (c < ASIZE)
152 //            j += bc[y[j+m-1]];
153 //          else
154 //            j += 1
155 //          #endif
156 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
157 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
158 //          if (c < ASIZE)
159 //            j += bc[y[j+m-1]];
160 //          else
161 //            j += m
162 //          #endif
163 //       }
164 //    }
165 
166   if (icnt1 == -1) {
167     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
168         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
169     Register cnt1end = tmp2;
170     Register str2end = cnt2;
171     Register skipch = tmp2;
172 
173     // str1 length is >=8, so, we can read at least 1 register for cases when
174     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
175     // UL case. We'll re-read last character in inner pre-loop code to have
176     // single outer pre-loop load
177     const int firstStep = isL ? 7 : 3;
178 
179     const int ASIZE = 256;
180     const int STORED_BYTES = 32; // amount of bytes stored per instruction
181     sub(sp, sp, ASIZE);
182     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
183     mov(ch1, sp);
184     BIND(BM_INIT_LOOP);
185       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
186       subs(tmp5, tmp5, 1);
187       br(GT, BM_INIT_LOOP);
188 
189       sub(cnt1tmp, cnt1, 1);
190       mov(tmp5, str2);
191       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
192       sub(ch2, cnt1, 1);
193       mov(tmp3, str1);
194     BIND(BCLOOP);
195       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
196       if (!str1_isL) {
197         subs(zr, ch1, ASIZE);
198         br(HS, BCSKIP);
199       }
200       strb(ch2, Address(sp, ch1));
201     BIND(BCSKIP);
202       subs(ch2, ch2, 1);
203       br(GT, BCLOOP);
204 
205       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
206       if (str1_isL == str2_isL) {
207         // load last 8 bytes (8LL/4UU symbols)
208         ldr(tmp6, Address(tmp6, -wordSize));
209       } else {
210         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
211         // convert Latin1 to UTF. We'll have to wait until load completed, but
212         // it's still faster than per-character loads+checks
213         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
214         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
215         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
216         andr(tmp6, tmp6, 0xFF); // str1[N-4]
217         orr(ch2, ch1, ch2, LSL, 16);
218         orr(tmp6, tmp6, tmp3, LSL, 48);
219         orr(tmp6, tmp6, ch2, LSL, 16);
220       }
221     BIND(BMLOOPSTR2);
222       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
223       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
224       if (str1_isL == str2_isL) {
225         // re-init tmp3. It's for free because it's executed in parallel with
226         // load above. Alternative is to initialize it before loop, but it'll
227         // affect performance on in-order systems with 2 or more ld/st pipelines
228         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
229       }
230       if (!isL) { // UU/UL case
231         lsl(ch2, cnt1tmp, 1); // offset in bytes
232       }
233       cmp(tmp3, skipch);
234       br(NE, BMSKIP);
235       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
236       mov(ch1, tmp6);
237       if (isL) {
238         b(BMLOOPSTR1_AFTER_LOAD);
239       } else {
240         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
241         b(BMLOOPSTR1_CMP);
242       }
243     BIND(BMLOOPSTR1);
244       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
245       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
246     BIND(BMLOOPSTR1_AFTER_LOAD);
247       subs(cnt1tmp, cnt1tmp, 1);
248       br(LT, BMLOOPSTR1_LASTCMP);
249     BIND(BMLOOPSTR1_CMP);
250       cmp(ch1, ch2);
251       br(EQ, BMLOOPSTR1);
252     BIND(BMSKIP);
253       if (!isL) {
254         // if we've met UTF symbol while searching Latin1 pattern, then we can
255         // skip cnt1 symbols
256         if (str1_isL != str2_isL) {
257           mov(result_tmp, cnt1);
258         } else {
259           mov(result_tmp, 1);
260         }
261         subs(zr, skipch, ASIZE);
262         br(HS, BMADV);
263       }
264       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
265     BIND(BMADV);
266       sub(cnt1tmp, cnt1, 1);
267       add(str2, str2, result_tmp, LSL, str2_chr_shift);
268       cmp(str2, str2end);
269       br(LE, BMLOOPSTR2);
270       add(sp, sp, ASIZE);
271       b(NOMATCH);
272     BIND(BMLOOPSTR1_LASTCMP);
273       cmp(ch1, ch2);
274       br(NE, BMSKIP);
275     BIND(BMMATCH);
276       sub(result, str2, tmp5);
277       if (!str2_isL) lsr(result, result, 1);
278       add(sp, sp, ASIZE);
279       b(DONE);
280 
281     BIND(LINEARSTUB);
282     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
283     br(LT, LINEAR_MEDIUM);
284     mov(result, zr);
285     RuntimeAddress stub = NULL;
286     if (isL) {
287       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
288       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
289     } else if (str1_isL) {
290       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
291        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
292     } else {
293       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
294       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
295     }
296     trampoline_call(stub);
297     b(DONE);
298   }
299 
300   BIND(LINEARSEARCH);
301   {
302     Label DO1, DO2, DO3;
303 
304     Register str2tmp = tmp2;
305     Register first = tmp3;
306 
307     if (icnt1 == -1)
308     {
309         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
310 
311         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
312         br(LT, DOSHORT);
313       BIND(LINEAR_MEDIUM);
314         (this->*str1_load_1chr)(first, Address(str1));
315         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
316         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
317         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
318         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
319 
320       BIND(FIRST_LOOP);
321         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
322         cmp(first, ch2);
323         br(EQ, STR1_LOOP);
324       BIND(STR2_NEXT);
325         adds(cnt2_neg, cnt2_neg, str2_chr_size);
326         br(LE, FIRST_LOOP);
327         b(NOMATCH);
328 
329       BIND(STR1_LOOP);
330         adds(cnt1tmp, cnt1_neg, str1_chr_size);
331         add(cnt2tmp, cnt2_neg, str2_chr_size);
332         br(GE, MATCH);
333 
334       BIND(STR1_NEXT);
335         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
336         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
337         cmp(ch1, ch2);
338         br(NE, STR2_NEXT);
339         adds(cnt1tmp, cnt1tmp, str1_chr_size);
340         add(cnt2tmp, cnt2tmp, str2_chr_size);
341         br(LT, STR1_NEXT);
342         b(MATCH);
343 
344       BIND(DOSHORT);
345       if (str1_isL == str2_isL) {
346         cmp(cnt1, (u1)2);
347         br(LT, DO1);
348         br(GT, DO3);
349       }
350     }
351 
352     if (icnt1 == 4) {
353       Label CH1_LOOP;
354 
355         (this->*load_4chr)(ch1, str1);
356         sub(result_tmp, cnt2, 4);
357         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
358         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
359 
360       BIND(CH1_LOOP);
361         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
362         cmp(ch1, ch2);
363         br(EQ, MATCH);
364         adds(cnt2_neg, cnt2_neg, str2_chr_size);
365         br(LE, CH1_LOOP);
366         b(NOMATCH);
367       }
368 
369     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
370       Label CH1_LOOP;
371 
372       BIND(DO2);
373         (this->*load_2chr)(ch1, str1);
374         if (icnt1 == 2) {
375           sub(result_tmp, cnt2, 2);
376         }
377         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
378         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
379       BIND(CH1_LOOP);
380         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
381         cmp(ch1, ch2);
382         br(EQ, MATCH);
383         adds(cnt2_neg, cnt2_neg, str2_chr_size);
384         br(LE, CH1_LOOP);
385         b(NOMATCH);
386     }
387 
388     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
389       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
390 
391       BIND(DO3);
392         (this->*load_2chr)(first, str1);
393         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
394         if (icnt1 == 3) {
395           sub(result_tmp, cnt2, 3);
396         }
397         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
398         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
399       BIND(FIRST_LOOP);
400         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
401         cmpw(first, ch2);
402         br(EQ, STR1_LOOP);
403       BIND(STR2_NEXT);
404         adds(cnt2_neg, cnt2_neg, str2_chr_size);
405         br(LE, FIRST_LOOP);
406         b(NOMATCH);
407 
408       BIND(STR1_LOOP);
409         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
410         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
411         cmp(ch1, ch2);
412         br(NE, STR2_NEXT);
413         b(MATCH);
414     }
415 
416     if (icnt1 == -1 || icnt1 == 1) {
417       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
418 
419       BIND(DO1);
420         (this->*str1_load_1chr)(ch1, str1);
421         cmp(cnt2, (u1)8);
422         br(LT, DO1_SHORT);
423 
424         sub(result_tmp, cnt2, 8/str2_chr_size);
425         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
426         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
427         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
428 
429         if (str2_isL) {
430           orr(ch1, ch1, ch1, LSL, 8);
431         }
432         orr(ch1, ch1, ch1, LSL, 16);
433         orr(ch1, ch1, ch1, LSL, 32);
434       BIND(CH1_LOOP);
435         ldr(ch2, Address(str2, cnt2_neg));
436         eor(ch2, ch1, ch2);
437         sub(tmp1, ch2, tmp3);
438         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
439         bics(tmp1, tmp1, tmp2);
440         br(NE, HAS_ZERO);
441         adds(cnt2_neg, cnt2_neg, 8);
442         br(LT, CH1_LOOP);
443 
444         cmp(cnt2_neg, (u1)8);
445         mov(cnt2_neg, 0);
446         br(LT, CH1_LOOP);
447         b(NOMATCH);
448 
449       BIND(HAS_ZERO);
450         rev(tmp1, tmp1);
451         clz(tmp1, tmp1);
452         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
453         b(MATCH);
454 
455       BIND(DO1_SHORT);
456         mov(result_tmp, cnt2);
457         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
458         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
459       BIND(DO1_LOOP);
460         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
461         cmpw(ch1, ch2);
462         br(EQ, MATCH);
463         adds(cnt2_neg, cnt2_neg, str2_chr_size);
464         br(LT, DO1_LOOP);
465     }
466   }
467   BIND(NOMATCH);
468     mov(result, -1);
469     b(DONE);
470   BIND(MATCH);
471     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
472   BIND(DONE);
473 }
474 
475 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
476 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
477 
string_indexof_char(Register str1,Register cnt1,Register ch,Register result,Register tmp1,Register tmp2,Register tmp3)478 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
479                                             Register ch, Register result,
480                                             Register tmp1, Register tmp2, Register tmp3)
481 {
482   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
483   Register cnt1_neg = cnt1;
484   Register ch1 = rscratch1;
485   Register result_tmp = rscratch2;
486 
487   cbz(cnt1, NOMATCH);
488 
489   cmp(cnt1, (u1)4);
490   br(LT, DO1_SHORT);
491 
492   orr(ch, ch, ch, LSL, 16);
493   orr(ch, ch, ch, LSL, 32);
494 
495   sub(cnt1, cnt1, 4);
496   mov(result_tmp, cnt1);
497   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
498   sub(cnt1_neg, zr, cnt1, LSL, 1);
499 
500   mov(tmp3, 0x0001000100010001);
501 
502   BIND(CH1_LOOP);
503     ldr(ch1, Address(str1, cnt1_neg));
504     eor(ch1, ch, ch1);
505     sub(tmp1, ch1, tmp3);
506     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
507     bics(tmp1, tmp1, tmp2);
508     br(NE, HAS_ZERO);
509     adds(cnt1_neg, cnt1_neg, 8);
510     br(LT, CH1_LOOP);
511 
512     cmp(cnt1_neg, (u1)8);
513     mov(cnt1_neg, 0);
514     br(LT, CH1_LOOP);
515     b(NOMATCH);
516 
517   BIND(HAS_ZERO);
518     rev(tmp1, tmp1);
519     clz(tmp1, tmp1);
520     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
521     b(MATCH);
522 
523   BIND(DO1_SHORT);
524     mov(result_tmp, cnt1);
525     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
526     sub(cnt1_neg, zr, cnt1, LSL, 1);
527   BIND(DO1_LOOP);
528     ldrh(ch1, Address(str1, cnt1_neg));
529     cmpw(ch, ch1);
530     br(EQ, MATCH);
531     adds(cnt1_neg, cnt1_neg, 2);
532     br(LT, DO1_LOOP);
533   BIND(NOMATCH);
534     mov(result, -1);
535     b(DONE);
536   BIND(MATCH);
537     add(result, result_tmp, cnt1_neg, ASR, 1);
538   BIND(DONE);
539 }
540 
541 // Compare strings.
string_compare(Register str1,Register str2,Register cnt1,Register cnt2,Register result,Register tmp1,Register tmp2,FloatRegister vtmp1,FloatRegister vtmp2,FloatRegister vtmp3,int ae)542 void C2_MacroAssembler::string_compare(Register str1, Register str2,
543     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
544     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
545   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
546       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
547       SHORT_LOOP_START, TAIL_CHECK;
548 
549   bool isLL = ae == StrIntrinsicNode::LL;
550   bool isLU = ae == StrIntrinsicNode::LU;
551   bool isUL = ae == StrIntrinsicNode::UL;
552 
553   // The stub threshold for LL strings is: 72 (64 + 8) chars
554   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
555   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
556   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
557 
558   bool str1_isL = isLL || isLU;
559   bool str2_isL = isLL || isUL;
560 
561   int str1_chr_shift = str1_isL ? 0 : 1;
562   int str2_chr_shift = str2_isL ? 0 : 1;
563   int str1_chr_size = str1_isL ? 1 : 2;
564   int str2_chr_size = str2_isL ? 1 : 2;
565   int minCharsInWord = isLL ? wordSize : wordSize/2;
566 
567   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
568   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
569                                       (chr_insn)&MacroAssembler::ldrh;
570   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
571                                       (chr_insn)&MacroAssembler::ldrh;
572   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
573                             (uxt_insn)&MacroAssembler::uxthw;
574 
575   BLOCK_COMMENT("string_compare {");
576 
577   // Bizzarely, the counts are passed in bytes, regardless of whether they
578   // are L or U strings, however the result is always in characters.
579   if (!str1_isL) asrw(cnt1, cnt1, 1);
580   if (!str2_isL) asrw(cnt2, cnt2, 1);
581 
582   // Compute the minimum of the string lengths and save the difference.
583   subsw(result, cnt1, cnt2);
584   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
585 
586   // A very short string
587   cmpw(cnt2, minCharsInWord);
588   br(Assembler::LE, SHORT_STRING);
589 
590   // Compare longwords
591   // load first parts of strings and finish initialization while loading
592   {
593     if (str1_isL == str2_isL) { // LL or UU
594       ldr(tmp1, Address(str1));
595       cmp(str1, str2);
596       br(Assembler::EQ, DONE);
597       ldr(tmp2, Address(str2));
598       cmp(cnt2, stub_threshold);
599       br(GE, STUB);
600       subsw(cnt2, cnt2, minCharsInWord);
601       br(EQ, TAIL_CHECK);
602       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
603       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
604       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
605     } else if (isLU) {
606       ldrs(vtmp, Address(str1));
607       ldr(tmp2, Address(str2));
608       cmp(cnt2, stub_threshold);
609       br(GE, STUB);
610       subw(cnt2, cnt2, 4);
611       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
612       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
613       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
614       zip1(vtmp, T8B, vtmp, vtmpZ);
615       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
616       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
617       add(cnt1, cnt1, 4);
618       fmovd(tmp1, vtmp);
619     } else { // UL case
620       ldr(tmp1, Address(str1));
621       ldrs(vtmp, Address(str2));
622       cmp(cnt2, stub_threshold);
623       br(GE, STUB);
624       subw(cnt2, cnt2, 4);
625       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
626       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
627       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
628       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
629       zip1(vtmp, T8B, vtmp, vtmpZ);
630       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
631       add(cnt1, cnt1, 8);
632       fmovd(tmp2, vtmp);
633     }
634     adds(cnt2, cnt2, isUL ? 4 : 8);
635     br(GE, TAIL);
636     eor(rscratch2, tmp1, tmp2);
637     cbnz(rscratch2, DIFFERENCE);
638     // main loop
639     bind(NEXT_WORD);
640     if (str1_isL == str2_isL) {
641       ldr(tmp1, Address(str1, cnt2));
642       ldr(tmp2, Address(str2, cnt2));
643       adds(cnt2, cnt2, 8);
644     } else if (isLU) {
645       ldrs(vtmp, Address(str1, cnt1));
646       ldr(tmp2, Address(str2, cnt2));
647       add(cnt1, cnt1, 4);
648       zip1(vtmp, T8B, vtmp, vtmpZ);
649       fmovd(tmp1, vtmp);
650       adds(cnt2, cnt2, 8);
651     } else { // UL
652       ldrs(vtmp, Address(str2, cnt2));
653       ldr(tmp1, Address(str1, cnt1));
654       zip1(vtmp, T8B, vtmp, vtmpZ);
655       add(cnt1, cnt1, 8);
656       fmovd(tmp2, vtmp);
657       adds(cnt2, cnt2, 4);
658     }
659     br(GE, TAIL);
660 
661     eor(rscratch2, tmp1, tmp2);
662     cbz(rscratch2, NEXT_WORD);
663     b(DIFFERENCE);
664     bind(TAIL);
665     eor(rscratch2, tmp1, tmp2);
666     cbnz(rscratch2, DIFFERENCE);
667     // Last longword.  In the case where length == 4 we compare the
668     // same longword twice, but that's still faster than another
669     // conditional branch.
670     if (str1_isL == str2_isL) {
671       ldr(tmp1, Address(str1));
672       ldr(tmp2, Address(str2));
673     } else if (isLU) {
674       ldrs(vtmp, Address(str1));
675       ldr(tmp2, Address(str2));
676       zip1(vtmp, T8B, vtmp, vtmpZ);
677       fmovd(tmp1, vtmp);
678     } else { // UL
679       ldrs(vtmp, Address(str2));
680       ldr(tmp1, Address(str1));
681       zip1(vtmp, T8B, vtmp, vtmpZ);
682       fmovd(tmp2, vtmp);
683     }
684     bind(TAIL_CHECK);
685     eor(rscratch2, tmp1, tmp2);
686     cbz(rscratch2, DONE);
687 
688     // Find the first different characters in the longwords and
689     // compute their difference.
690     bind(DIFFERENCE);
691     rev(rscratch2, rscratch2);
692     clz(rscratch2, rscratch2);
693     andr(rscratch2, rscratch2, isLL ? -8 : -16);
694     lsrv(tmp1, tmp1, rscratch2);
695     (this->*ext_chr)(tmp1, tmp1);
696     lsrv(tmp2, tmp2, rscratch2);
697     (this->*ext_chr)(tmp2, tmp2);
698     subw(result, tmp1, tmp2);
699     b(DONE);
700   }
701 
702   bind(STUB);
703     RuntimeAddress stub = NULL;
704     switch(ae) {
705       case StrIntrinsicNode::LL:
706         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
707         break;
708       case StrIntrinsicNode::UU:
709         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
710         break;
711       case StrIntrinsicNode::LU:
712         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
713         break;
714       case StrIntrinsicNode::UL:
715         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
716         break;
717       default:
718         ShouldNotReachHere();
719      }
720     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
721     trampoline_call(stub);
722     b(DONE);
723 
724   bind(SHORT_STRING);
725   // Is the minimum length zero?
726   cbz(cnt2, DONE);
727   // arrange code to do most branches while loading and loading next characters
728   // while comparing previous
729   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
730   subs(cnt2, cnt2, 1);
731   br(EQ, SHORT_LAST_INIT);
732   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
733   b(SHORT_LOOP_START);
734   bind(SHORT_LOOP);
735   subs(cnt2, cnt2, 1);
736   br(EQ, SHORT_LAST);
737   bind(SHORT_LOOP_START);
738   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
739   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
740   cmp(tmp1, cnt1);
741   br(NE, SHORT_LOOP_TAIL);
742   subs(cnt2, cnt2, 1);
743   br(EQ, SHORT_LAST2);
744   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
745   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
746   cmp(tmp2, rscratch1);
747   br(EQ, SHORT_LOOP);
748   sub(result, tmp2, rscratch1);
749   b(DONE);
750   bind(SHORT_LOOP_TAIL);
751   sub(result, tmp1, cnt1);
752   b(DONE);
753   bind(SHORT_LAST2);
754   cmp(tmp2, rscratch1);
755   br(EQ, DONE);
756   sub(result, tmp2, rscratch1);
757 
758   b(DONE);
759   bind(SHORT_LAST_INIT);
760   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
761   bind(SHORT_LAST);
762   cmp(tmp1, cnt1);
763   br(EQ, DONE);
764   sub(result, tmp1, cnt1);
765 
766   bind(DONE);
767 
768   BLOCK_COMMENT("} string_compare");
769 }
770