1 /*
2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  *
24  */
25 
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "gc/shared/barrierSet.hpp"
30 #include "gc/shared/barrierSetAssembler.hpp"
31 #include "interpreter/interpreter.hpp"
32 #include "memory/universe.hpp"
33 #include "nativeInst_aarch64.hpp"
34 #include "oops/instanceOop.hpp"
35 #include "oops/method.hpp"
36 #include "oops/objArrayKlass.hpp"
37 #include "oops/oop.inline.hpp"
38 #include "prims/methodHandles.hpp"
39 #include "runtime/frame.inline.hpp"
40 #include "runtime/handles.inline.hpp"
41 #include "runtime/sharedRuntime.hpp"
42 #include "runtime/stubCodeGenerator.hpp"
43 #include "runtime/stubRoutines.hpp"
44 #include "runtime/thread.inline.hpp"
45 #include "utilities/align.hpp"
46 #ifdef COMPILER2
47 #include "opto/runtime.hpp"
48 #endif
49 #if INCLUDE_ZGC
50 #include "gc/z/zThreadLocalData.hpp"
51 #endif
52 
53 // Declaration and definition of StubGenerator (no .hpp file).
54 // For a more detailed description of the stub routine structure
55 // see the comment in stubRoutines.hpp
56 
57 #undef __
58 #define __ _masm->
59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
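// TIMES_OOP builds the scaled-index addressing extend for oop arrays: the
// (signed 32-bit) element index is shifted by log2 of the element size,
// i.e. by 2 when compressed oops (4-byte elements) are in use and by 3
// otherwise (8-byte elements).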
60 
61 #ifdef PRODUCT
62 #define BLOCK_COMMENT(str) /* nothing */
63 #else
64 #define BLOCK_COMMENT(str) __ block_comment(str)
65 #endif
66 
67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
68 
69 // Stub Code definitions
70 
71 class StubGenerator: public StubCodeGenerator {
72  private:
73 
74 #ifdef PRODUCT
75 #define inc_counter_np(counter) ((void)0)
76 #else
77   void inc_counter_np_(int& counter) {
78     __ lea(rscratch2, ExternalAddress((address)&counter));
79     __ ldrw(rscratch1, Address(rscratch2));
80     __ addw(rscratch1, rscratch1, 1);
81     __ strw(rscratch1, Address(rscratch2));
82   }
83 #define inc_counter_np(counter) \
84   BLOCK_COMMENT("inc_counter " #counter); \
85   inc_counter_np_(counter);
86 #endif
87 
88   // Call stubs are used to call Java from C
89   //
90   // Arguments:
91   //    c_rarg0:   call wrapper address                   address
92   //    c_rarg1:   result                                 address
93   //    c_rarg2:   result type                            BasicType
94   //    c_rarg3:   method                                 Method*
95   //    c_rarg4:   (interpreter) entry point              address
96   //    c_rarg5:   parameters                             intptr_t*
97   //    c_rarg6:   parameter size (in words)              int
98   //    c_rarg7:   thread                                 Thread*
99   //
100   // There is no return from the stub itself as any Java result
101   // is written to result
102   //
103   // we save r30 (lr) as the return PC at the base of the frame and
104   // save r29 (fp) below it, then install sp (r31) into r29 so that
105   // fp becomes the frame pointer for this frame.
106   //
107   // we save r0-r7, which accounts for all the c arguments.
108   //
109   // TODO: strictly do we need to save them all? they are treated as
110   // volatile by C so could we omit saving the ones we are going to
111   // place in global registers (thread? method?) or those we only use
112   // during setup of the Java call?
113   //
114   // we don't need to save r8 which C uses as an indirect result location
115   // return register.
116   //
117   // we don't need to save r9-r15 which both C and Java treat as
118   // volatile
119   //
120   // we don't need to save r16-18 because Java does not use them
121   //
122   // we save r19-r28 which Java uses as scratch registers and C
123   // expects to be callee-save
124   //
125   // we save the bottom 64 bits of each value stored in v8-v15; it is
126   // the responsibility of the caller to preserve larger values.
127   //
128   // so the stub frame looks like this when we enter Java code
129   //
130   //     [ return_from_Java     ] <--- sp
131   //     [ argument word n      ]
132   //      ...
133   // -27 [ argument word 1      ]
134   // -26 [ saved v15            ] <--- sp_after_call
135   // -25 [ saved v14            ]
136   // -24 [ saved v13            ]
137   // -23 [ saved v12            ]
138   // -22 [ saved v11            ]
139   // -21 [ saved v10            ]
140   // -20 [ saved v9             ]
141   // -19 [ saved v8             ]
142   // -18 [ saved r28            ]
143   // -17 [ saved r27            ]
144   // -16 [ saved r26            ]
145   // -15 [ saved r25            ]
146   // -14 [ saved r24            ]
147   // -13 [ saved r23            ]
148   // -12 [ saved r22            ]
149   // -11 [ saved r21            ]
150   // -10 [ saved r20            ]
151   //  -9 [ saved r19            ]
152   //  -8 [ call wrapper    (r0) ]
153   //  -7 [ result          (r1) ]
154   //  -6 [ result type     (r2) ]
155   //  -5 [ method          (r3) ]
156   //  -4 [ entry point     (r4) ]
157   //  -3 [ parameters      (r5) ]
158   //  -2 [ parameter size  (r6) ]
159   //  -1 [ thread (r7)          ]
160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
161   //   1 [ saved lr       (r30) ]
162 
163   // Call stub stack layout word offsets from fp
164   enum call_stub_layout {
165     sp_after_call_off = -26,
166 
167     d15_off            = -26,
168     d13_off            = -24,
169     d11_off            = -22,
170     d9_off             = -20,
171 
172     r28_off            = -18,
173     r26_off            = -16,
174     r24_off            = -14,
175     r22_off            = -12,
176     r20_off            = -10,
177     call_wrapper_off   =  -8,
178     result_off         =  -7,
179     result_type_off    =  -6,
180     method_off         =  -5,
181     entry_point_off    =  -4,
182     parameter_size_off =  -2,
183     thread_off         =  -1,
184     fp_f               =   0,
185     retaddr_off        =   1,
186   };
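  // As a worked example (wordSize == 8): sp_after_call_off == -26 puts
  // sp_after_call at rfp - 208, and thread_off == -1 puts the saved c_rarg7
  // at rfp - 8, directly below the saved fp slot at rfp itself.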
187 
188   address generate_call_stub(address& return_address) {
189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
191            "adjust this code");
192 
193     StubCodeMark mark(this, "StubRoutines", "call_stub");
194     address start = __ pc();
195 
196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
197 
198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
199     const Address result        (rfp, result_off         * wordSize);
200     const Address result_type   (rfp, result_type_off    * wordSize);
201     const Address method        (rfp, method_off         * wordSize);
202     const Address entry_point   (rfp, entry_point_off    * wordSize);
203     const Address parameter_size(rfp, parameter_size_off * wordSize);
204 
205     const Address thread        (rfp, thread_off         * wordSize);
206 
207     const Address d15_save      (rfp, d15_off * wordSize);
208     const Address d13_save      (rfp, d13_off * wordSize);
209     const Address d11_save      (rfp, d11_off * wordSize);
210     const Address d9_save       (rfp, d9_off * wordSize);
211 
212     const Address r28_save      (rfp, r28_off * wordSize);
213     const Address r26_save      (rfp, r26_off * wordSize);
214     const Address r24_save      (rfp, r24_off * wordSize);
215     const Address r22_save      (rfp, r22_off * wordSize);
216     const Address r20_save      (rfp, r20_off * wordSize);
217 
218     // stub code
219 
220     address aarch64_entry = __ pc();
221 
222     // set up frame and move sp to end of save area
223     __ enter();
224     __ sub(sp, rfp, -sp_after_call_off * wordSize);
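    // (-sp_after_call_off == 26, so this sets sp = rfp - 26 * wordSize,
    // i.e. sp now points at the sp_after_call slot where v15 is saved)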
225 
226     // save register parameters and Java scratch/global registers
227     // n.b. we save thread even though it gets installed in
228     // rthread because we want to sanity check rthread later
229     __ str(c_rarg7,  thread);
230     __ strw(c_rarg6, parameter_size);
231     __ stp(c_rarg4, c_rarg5,  entry_point);
232     __ stp(c_rarg2, c_rarg3,  result_type);
233     __ stp(c_rarg0, c_rarg1,  call_wrapper);
234 
235     __ stp(r20, r19,   r20_save);
236     __ stp(r22, r21,   r22_save);
237     __ stp(r24, r23,   r24_save);
238     __ stp(r26, r25,   r26_save);
239     __ stp(r28, r27,   r28_save);
240 
241     __ stpd(v9,  v8,   d9_save);
242     __ stpd(v11, v10,  d11_save);
243     __ stpd(v13, v12,  d13_save);
244     __ stpd(v15, v14,  d15_save);
245 
246     // install Java thread in global register now that we have saved
247     // whatever value it held
248     __ mov(rthread, c_rarg7);
249     // And method
250     __ mov(rmethod, c_rarg3);
251 
252     // set up the heapbase register
253     __ reinit_heapbase();
254 
255 #ifdef ASSERT
256     // make sure we have no pending exceptions
257     {
258       Label L;
259       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
260       __ cmp(rscratch1, (u1)NULL_WORD);
261       __ br(Assembler::EQ, L);
262       __ stop("StubRoutines::call_stub: entered with pending exception");
263       __ BIND(L);
264     }
265 #endif
266     // pass parameters if any
267     __ mov(esp, sp);
268     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
269     __ andr(sp, rscratch1, -2 * wordSize);
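    // rscratch1 = sp - parameter_size * wordSize; rounding it down to a
    // 16-byte boundary reserves space for the outgoing Java parameters while
    // keeping sp 16-byte aligned as AArch64 requires when sp is a base register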
270 
271     BLOCK_COMMENT("pass parameters if any");
272     Label parameters_done;
273     // parameter count is still in c_rarg6
274     // and parameter pointer identifying param 1 is in c_rarg5
275     __ cbzw(c_rarg6, parameters_done);
276 
277     address loop = __ pc();
278     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
279     __ subsw(c_rarg6, c_rarg6, 1);
280     __ push(rscratch1);
281     __ br(Assembler::GT, loop);
282 
283     __ BIND(parameters_done);
284 
285     // call Java entry -- passing Method* and current (sender) sp
286     //      rmethod: Method*
287     //      r13: sender sp
288     BLOCK_COMMENT("call Java function");
289     __ mov(r13, sp);
290     __ blr(c_rarg4);
291 
292     // we do this here because the notify will already have been done
293     // if we get to the next instruction via an exception
294     //
295     // n.b. adding this instruction here affects the calculation of
296     // whether or not a routine returns to the call stub (used when
297     // doing stack walks) since the normal test is to check the return
298     // pc against the address saved below. so we may need to allow for
299     // this extra instruction in the check.
300 
301     // save current address for use by exception handling code
302 
303     return_address = __ pc();
304 
305     // store result depending on type (everything that is not
306     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
307     // n.b. this assumes Java returns an integral result in r0
308     // and a floating result in j_farg0
309     __ ldr(j_rarg2, result);
310     Label is_long, is_float, is_double, exit;
311     __ ldr(j_rarg1, result_type);
312     __ cmp(j_rarg1, (u1)T_OBJECT);
313     __ br(Assembler::EQ, is_long);
314     __ cmp(j_rarg1, (u1)T_LONG);
315     __ br(Assembler::EQ, is_long);
316     __ cmp(j_rarg1, (u1)T_FLOAT);
317     __ br(Assembler::EQ, is_float);
318     __ cmp(j_rarg1, (u1)T_DOUBLE);
319     __ br(Assembler::EQ, is_double);
320 
321     // handle T_INT case
322     __ strw(r0, Address(j_rarg2));
323 
324     __ BIND(exit);
325 
326     // pop parameters
327     __ sub(esp, rfp, -sp_after_call_off * wordSize);
328 
329 #ifdef ASSERT
330     // verify that threads correspond
331     {
332       Label L, S;
333       __ ldr(rscratch1, thread);
334       __ cmp(rthread, rscratch1);
335       __ br(Assembler::NE, S);
336       __ get_thread(rscratch1);
337       __ cmp(rthread, rscratch1);
338       __ br(Assembler::EQ, L);
339       __ BIND(S);
340       __ stop("StubRoutines::call_stub: threads must correspond");
341       __ BIND(L);
342     }
343 #endif
344 
345     // restore callee-save registers
346     __ ldpd(v15, v14,  d15_save);
347     __ ldpd(v13, v12,  d13_save);
348     __ ldpd(v11, v10,  d11_save);
349     __ ldpd(v9,  v8,   d9_save);
350 
351     __ ldp(r28, r27,   r28_save);
352     __ ldp(r26, r25,   r26_save);
353     __ ldp(r24, r23,   r24_save);
354     __ ldp(r22, r21,   r22_save);
355     __ ldp(r20, r19,   r20_save);
356 
357     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
358     __ ldrw(c_rarg2, result_type);
359     __ ldr(c_rarg3,  method);
360     __ ldp(c_rarg4, c_rarg5,  entry_point);
361     __ ldp(c_rarg6, c_rarg7,  parameter_size);
362 
363     // leave frame and return to caller
364     __ leave();
365     __ ret(lr);
366 
367     // handle return types different from T_INT
368 
369     __ BIND(is_long);
370     __ str(r0, Address(j_rarg2, 0));
371     __ br(Assembler::AL, exit);
372 
373     __ BIND(is_float);
374     __ strs(j_farg0, Address(j_rarg2, 0));
375     __ br(Assembler::AL, exit);
376 
377     __ BIND(is_double);
378     __ strd(j_farg0, Address(j_rarg2, 0));
379     __ br(Assembler::AL, exit);
380 
381     return start;
382   }
383 
384   // Return point for a Java call if there's an exception thrown in
385   // Java code.  The exception is caught and transformed into a
386   // pending exception stored in JavaThread that can be tested from
387   // within the VM.
388   //
389   // Note: Usually the parameters are removed by the callee. In case
390   // of an exception crossing an activation frame boundary, that is
391   // not the case if the callee is compiled code => need to setup the
392   // rsp.
393   //
394   // r0: exception oop
395 
396   address generate_catch_exception() {
397     StubCodeMark mark(this, "StubRoutines", "catch_exception");
398     address start = __ pc();
399 
400     // same as in generate_call_stub():
401     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
402     const Address thread        (rfp, thread_off         * wordSize);
403 
404 #ifdef ASSERT
405     // verify that threads correspond
406     {
407       Label L, S;
408       __ ldr(rscratch1, thread);
409       __ cmp(rthread, rscratch1);
410       __ br(Assembler::NE, S);
411       __ get_thread(rscratch1);
412       __ cmp(rthread, rscratch1);
413       __ br(Assembler::EQ, L);
414       __ bind(S);
415       __ stop("StubRoutines::catch_exception: threads must correspond");
416       __ bind(L);
417     }
418 #endif
419 
420     // set pending exception
421     __ verify_oop(r0);
422 
423     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
424     __ mov(rscratch1, (address)__FILE__);
425     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
426     __ movw(rscratch1, (int)__LINE__);
427     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
428 
429     // complete return to VM
430     assert(StubRoutines::_call_stub_return_address != NULL,
431            "_call_stub_return_address must have been generated before");
432     __ b(StubRoutines::_call_stub_return_address);
433 
434     return start;
435   }
436 
437   // Continuation point for runtime calls returning with a pending
438   // exception.  The pending exception check happened in the runtime
439   // or native call stub.  The pending exception in Thread is
440   // converted into a Java-level exception.
441   //
442   // Contract with Java-level exception handlers:
443   // r0: exception
444   // r3: throwing pc
445   //
446   // NOTE: At entry of this stub, exception-pc must be in LR !!
447 
448   // NOTE: this is always used as a jump target within generated code
449   // so it just needs to be generated code with no prolog
450 
451   address generate_forward_exception() {
452     StubCodeMark mark(this, "StubRoutines", "forward exception");
453     address start = __ pc();
454 
455     // Upon entry, LR points to the return address returning into
456     // Java (interpreted or compiled) code; i.e., the return address
457     // becomes the throwing pc.
458     //
459     // Arguments pushed before the runtime call are still on the stack
460     // but the exception handler will reset the stack pointer ->
461     // ignore them.  A potential result in registers can be ignored as
462     // well.
463 
464 #ifdef ASSERT
465     // make sure this code is only executed if there is a pending exception
466     {
467       Label L;
468       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
469       __ cbnz(rscratch1, L);
470       __ stop("StubRoutines::forward exception: no pending exception (1)");
471       __ bind(L);
472     }
473 #endif
474 
475     // compute exception handler into r19
476 
477     // call the VM to find the handler address associated with the
478     // caller address. pass thread in r0 and caller pc (ret address)
479     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
480     // the stack.
481     __ mov(c_rarg1, lr);
482     // lr will be trashed by the VM call so we move it to R19
483     // (callee-saved) because we also need to pass it to the handler
484     // returned by this call.
485     __ mov(r19, lr);
486     BLOCK_COMMENT("call exception_handler_for_return_address");
487     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
488                          SharedRuntime::exception_handler_for_return_address),
489                     rthread, c_rarg1);
490     // we should not really care that lr is no longer the callee
491     // address. we saved the value the handler needs in r19 so we can
492     // just copy it to r3. however, the C2 handler will push its own
493     // frame and then call into the VM, and the VM code asserts that
494     // the PC for the frame above the handler belongs to a compiled
495     // Java method. So, we restore lr here to satisfy that assert.
496     __ mov(lr, r19);
497     // setup r0 & r3 & clear pending exception
498     __ mov(r3, r19);
499     __ mov(r19, r0);
500     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
501     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
502 
503 #ifdef ASSERT
504     // make sure exception is set
505     {
506       Label L;
507       __ cbnz(r0, L);
508       __ stop("StubRoutines::forward exception: no pending exception (2)");
509       __ bind(L);
510     }
511 #endif
512 
513     // continue at exception handler
514     // r0: exception
515     // r3: throwing pc
516     // r19: exception handler
517     __ verify_oop(r0);
518     __ br(r19);
519 
520     return start;
521   }
522 
523   // Non-destructive plausibility checks for oops
524   //
525   // Arguments:
526   //    r0: oop to verify
527   //    rscratch1: error message
528   //
529   // Stack after saving c_rarg3:
530   //    [tos + 0]: saved c_rarg3
531   //    [tos + 1]: saved c_rarg2
532   //    [tos + 2]: saved lr
533   //    [tos + 3]: saved rscratch2
534   //    [tos + 4]: saved r0
535   //    [tos + 5]: saved rscratch1
536   address generate_verify_oop() {
537 
538     StubCodeMark mark(this, "StubRoutines", "verify_oop");
539     address start = __ pc();
540 
541     Label exit, error;
542 
543     // save c_rarg2 and c_rarg3
544     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
545 
546     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
547     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
548     __ ldr(c_rarg3, Address(c_rarg2));
549     __ add(c_rarg3, c_rarg3, 1);
550     __ str(c_rarg3, Address(c_rarg2));
551 
552     // object is in r0
553     // make sure object is 'reasonable'
554     __ cbz(r0, exit); // if obj is NULL it is OK
555 
556 #if INCLUDE_ZGC
557     if (UseZGC) {
558       // Check if mask is good.
559       // verifies that ZAddressBadMask & r0 == 0
560       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
561       __ andr(c_rarg2, r0, c_rarg3);
562       __ cbnz(c_rarg2, error);
563     }
564 #endif
565 
566     // Check if the oop is in the right area of memory
567     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
568     __ andr(c_rarg2, r0, c_rarg3);
569     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
570 
571     // Compare c_rarg2 and c_rarg3.  We don't use a compare
572     // instruction here because the flags register is live.
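    // The eor leaves c_rarg2 == 0 exactly when the masked oop bits match
    // verify_oop_bits(), so the cbnz below can reject a bad oop without
    // disturbing the flags.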
573     __ eor(c_rarg2, c_rarg2, c_rarg3);
574     __ cbnz(c_rarg2, error);
575 
576     // make sure klass is 'reasonable', i.e. not null.
577     __ load_klass(r0, r0);  // get klass
578     __ cbz(r0, error);      // if klass is NULL it is broken
579 
580     // return if everything seems ok
581     __ bind(exit);
582 
583     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
584     __ ret(lr);
585 
586     // handle errors
587     __ bind(error);
588     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
589 
590     __ push(RegSet::range(r0, r29), sp);
591     // debug(char* msg, int64_t pc, int64_t regs[])
592     __ mov(c_rarg0, rscratch1);      // pass address of error message
593     __ mov(c_rarg1, lr);             // pass return address
594     __ mov(c_rarg2, sp);             // pass address of regs on stack
595 #ifndef PRODUCT
596     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
597 #endif
598     BLOCK_COMMENT("call MacroAssembler::debug");
599     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
600     __ blr(rscratch1);
601 
602     return start;
603   }
604 
605   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
606 
607   // The inner part of zero_words().  This is the bulk operation,
608   // zeroing words in blocks, possibly using DC ZVA to do it.  The
609   // caller is responsible for zeroing the last few words.
610   //
611   // Inputs:
612   // r10: the HeapWord-aligned base address of an array to zero.
613   // r11: the count in HeapWords, r11 > 0.
614   //
615   // Returns r10 and r11, adjusted for the caller to clear.
616   // r10: the base address of the tail of words left to clear.
617   // r11: the number of words in the tail.
618   //      r11 < MacroAssembler::zero_words_block_size.
619 
620   address generate_zero_blocks() {
621     Label done;
622     Label base_aligned;
623 
624     Register base = r10, cnt = r11;
625 
626     __ align(CodeEntryAlignment);
627     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
628     address start = __ pc();
629 
630     if (UseBlockZeroing) {
631       int zva_length = VM_Version::zva_length();
632 
633       // Ensure ZVA length can be divided by 16. This is required by
634       // the subsequent operations.
635       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
636 
637       __ tbz(base, 3, base_aligned);
638       __ str(zr, Address(__ post(base, 8)));
639       __ sub(cnt, cnt, 1);
640       __ bind(base_aligned);
641 
642       // Ensure count >= zva_length * 2 so that it still deserves a zva after
643       // alignment.
644       Label small;
645       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
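      // zva_length and BlockZeroingLowLimit are byte counts while cnt is in
      // words, hence the >> 3 below when comparing against cnt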
646       __ subs(rscratch1, cnt, low_limit >> 3);
647       __ br(Assembler::LT, small);
648       __ zero_dcache_blocks(base, cnt);
649       __ bind(small);
650     }
651 
652     {
653       // Number of stp instructions we'll unroll
654       const int unroll =
655         MacroAssembler::zero_words_block_size / 2;
656       // Clear the remaining blocks.
657       Label loop;
658       __ subs(cnt, cnt, unroll * 2);
659       __ br(Assembler::LT, done);
660       __ bind(loop);
661       for (int i = 0; i < unroll; i++)
662         __ stp(zr, zr, __ post(base, 16));
663       __ subs(cnt, cnt, unroll * 2);
664       __ br(Assembler::GE, loop);
665       __ bind(done);
666       __ add(cnt, cnt, unroll * 2);
667     }
668 
669     __ ret(lr);
670 
671     return start;
672   }
673 
674 
675   typedef enum {
676     copy_forwards = 1,
677     copy_backwards = -1
678   } copy_direction;
679 
680   // Bulk copy of blocks of 8 words.
681   //
682   // count is a count of words.
683   //
684   // Precondition: count >= 8
685   //
686   // Postconditions:
687   //
688   // The least significant bit of count contains the remaining count
689   // of words to copy.  The rest of count is trash.
690   //
691   // s and d are adjusted to point to the remaining words to copy
692   //
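  // For example, with count == 19 the main loop and drain copy 16 words,
  // the tail test then copies one 2-word sub-block (bit 1 of count), and
  // bit 0 is left set so the caller copies the final word.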
693   void generate_copy_longs(Label &start, Register s, Register d, Register count,
694                            copy_direction direction) {
695     int unit = wordSize * direction;
696     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
697 
698     int offset;
699     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
700       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
701     const Register stride = r13;
702 
703     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
704     assert_different_registers(s, d, count, rscratch1);
705 
706     Label again, drain;
707     const char *stub_name;
708     if (direction == copy_forwards)
709       stub_name = "forward_copy_longs";
710     else
711       stub_name = "backward_copy_longs";
712 
713     __ align(CodeEntryAlignment);
714 
715     StubCodeMark mark(this, "StubRoutines", stub_name);
716 
717     __ bind(start);
718 
719     Label unaligned_copy_long;
720     if (AvoidUnalignedAccesses) {
721       __ tbnz(d, 3, unaligned_copy_long);
722     }
723 
724     if (direction == copy_forwards) {
725       __ sub(s, s, bias);
726       __ sub(d, d, bias);
727     }
728 
729 #ifdef ASSERT
730     // Make sure we are never given < 8 words
731     {
732       Label L;
733       __ cmp(count, (u1)8);
734       __ br(Assembler::GE, L);
735       __ stop("generate_copy_longs called with < 8 words");
736       __ bind(L);
737     }
738 #endif
739 
740     // Fill 8 registers
741     if (UseSIMDForMemoryOps) {
742       __ ldpq(v0, v1, Address(s, 4 * unit));
743       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
744     } else {
745       __ ldp(t0, t1, Address(s, 2 * unit));
746       __ ldp(t2, t3, Address(s, 4 * unit));
747       __ ldp(t4, t5, Address(s, 6 * unit));
748       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
749     }
750 
751     __ subs(count, count, 16);
752     __ br(Assembler::LO, drain);
753 
754     int prefetch = PrefetchCopyIntervalInBytes;
755     bool use_stride = false;
756     if (direction == copy_backwards) {
757        use_stride = prefetch > 256;
758        prefetch = -prefetch;
759        if (use_stride) __ mov(stride, prefetch);
760     }
761 
762     __ bind(again);
763 
764     if (PrefetchCopyIntervalInBytes > 0)
765       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
766 
767     if (UseSIMDForMemoryOps) {
768       __ stpq(v0, v1, Address(d, 4 * unit));
769       __ ldpq(v0, v1, Address(s, 4 * unit));
770       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
771       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
772     } else {
773       __ stp(t0, t1, Address(d, 2 * unit));
774       __ ldp(t0, t1, Address(s, 2 * unit));
775       __ stp(t2, t3, Address(d, 4 * unit));
776       __ ldp(t2, t3, Address(s, 4 * unit));
777       __ stp(t4, t5, Address(d, 6 * unit));
778       __ ldp(t4, t5, Address(s, 6 * unit));
779       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
780       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
781     }
782 
783     __ subs(count, count, 8);
784     __ br(Assembler::HS, again);
785 
786     // Drain
787     __ bind(drain);
788     if (UseSIMDForMemoryOps) {
789       __ stpq(v0, v1, Address(d, 4 * unit));
790       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
791     } else {
792       __ stp(t0, t1, Address(d, 2 * unit));
793       __ stp(t2, t3, Address(d, 4 * unit));
794       __ stp(t4, t5, Address(d, 6 * unit));
795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
796     }
797 
798     {
799       Label L1, L2;
800       __ tbz(count, exact_log2(4), L1);
801       if (UseSIMDForMemoryOps) {
802         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
803         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
804       } else {
805         __ ldp(t0, t1, Address(s, 2 * unit));
806         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
807         __ stp(t0, t1, Address(d, 2 * unit));
808         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
809       }
810       __ bind(L1);
811 
812       if (direction == copy_forwards) {
813         __ add(s, s, bias);
814         __ add(d, d, bias);
815       }
816 
817       __ tbz(count, 1, L2);
818       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
819       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
820       __ bind(L2);
821     }
822 
823     __ ret(lr);
824 
825     if (AvoidUnalignedAccesses) {
826       Label drain, again;
827       // Register order for storing. Order is different for backward copy.
828 
829       __ bind(unaligned_copy_long);
830 
831       // source address is even aligned, target odd aligned
832       //
833       // when forward copying word pairs we read long pairs at offsets
834       // {0, 2, 4, 6} (in long words). when backwards copying we read
835       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
836       // address by -2 in the forwards case so we can compute the
837       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
838       // or -1.
839       //
840       // when forward copying we need to store 1 word, 3 pairs and
841       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
842       // zero offset we adjust the destination by -1 which means we
843       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
844       //
845       // When backwards copying we need to store 1 word, 3 pairs and
846       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
847       // offsets {1, 3, 5, 7, 8} * unit.
848 
849       if (direction == copy_forwards) {
850         __ sub(s, s, 16);
851         __ sub(d, d, 8);
852       }
853 
854       // Fill 8 registers
855       //
856       // for forwards copy s was offset by -16 from the original input
857       // value of s so the register contents are at these offsets
858       // relative to the 64 byte block addressed by that original input
859       // and so on for each successive 64 byte block when s is updated
860       //
861       // t0 at offset 0,  t1 at offset 8
862       // t2 at offset 16, t3 at offset 24
863       // t4 at offset 32, t5 at offset 40
864       // t6 at offset 48, t7 at offset 56
865 
866       // for backwards copy s was not offset so the register contents
867       // are at these offsets into the preceding 64 byte block
868       // relative to that original input and so on for each successive
869       // preceding 64 byte block when s is updated. this explains the
870       // slightly counter-intuitive looking pattern of register usage
871       // in the stp instructions for backwards copy.
872       //
873       // t0 at offset -16, t1 at offset -8
874       // t2 at offset -32, t3 at offset -24
875       // t4 at offset -48, t5 at offset -40
876       // t6 at offset -64, t7 at offset -56
877 
878       __ ldp(t0, t1, Address(s, 2 * unit));
879       __ ldp(t2, t3, Address(s, 4 * unit));
880       __ ldp(t4, t5, Address(s, 6 * unit));
881       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
882 
883       __ subs(count, count, 16);
884       __ br(Assembler::LO, drain);
885 
886       int prefetch = PrefetchCopyIntervalInBytes;
887       bool use_stride = false;
888       if (direction == copy_backwards) {
889          use_stride = prefetch > 256;
890          prefetch = -prefetch;
891          if (use_stride) __ mov(stride, prefetch);
892       }
893 
894       __ bind(again);
895 
896       if (PrefetchCopyIntervalInBytes > 0)
897         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
898 
899       if (direction == copy_forwards) {
900        // allowing for the offset of -8 the store instructions place
901        // registers into the target 64 byte block at the following
902        // offsets
903        //
904        // t0 at offset 0
905        // t1 at offset 8,  t2 at offset 16
906        // t3 at offset 24, t4 at offset 32
907        // t5 at offset 40, t6 at offset 48
908        // t7 at offset 56
909 
910         __ str(t0, Address(d, 1 * unit));
911         __ stp(t1, t2, Address(d, 2 * unit));
912         __ ldp(t0, t1, Address(s, 2 * unit));
913         __ stp(t3, t4, Address(d, 4 * unit));
914         __ ldp(t2, t3, Address(s, 4 * unit));
915         __ stp(t5, t6, Address(d, 6 * unit));
916         __ ldp(t4, t5, Address(s, 6 * unit));
917         __ str(t7, Address(__ pre(d, 8 * unit)));
918         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
919       } else {
920        // d was not offset when we started so the registers are
921        // written into the 64 byte block preceding d with the following
922        // offsets
923        //
924        // t1 at offset -8
925        // t3 at offset -24, t0 at offset -16
926        // t5 at offset -40, t2 at offset -32
927        // t7 at offset -56, t4 at offset -48
928        //                   t6 at offset -64
929        //
930        // note that this matches the offsets previously noted for the
931        // loads
932 
933         __ str(t1, Address(d, 1 * unit));
934         __ stp(t3, t0, Address(d, 3 * unit));
935         __ ldp(t0, t1, Address(s, 2 * unit));
936         __ stp(t5, t2, Address(d, 5 * unit));
937         __ ldp(t2, t3, Address(s, 4 * unit));
938         __ stp(t7, t4, Address(d, 7 * unit));
939         __ ldp(t4, t5, Address(s, 6 * unit));
940         __ str(t6, Address(__ pre(d, 8 * unit)));
941         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
942       }
943 
944       __ subs(count, count, 8);
945       __ br(Assembler::HS, again);
946 
947       // Drain
948       //
949       // this uses the same pattern of offsets and register arguments
950       // as above
951       __ bind(drain);
952       if (direction == copy_forwards) {
953         __ str(t0, Address(d, 1 * unit));
954         __ stp(t1, t2, Address(d, 2 * unit));
955         __ stp(t3, t4, Address(d, 4 * unit));
956         __ stp(t5, t6, Address(d, 6 * unit));
957         __ str(t7, Address(__ pre(d, 8 * unit)));
958       } else {
959         __ str(t1, Address(d, 1 * unit));
960         __ stp(t3, t0, Address(d, 3 * unit));
961         __ stp(t5, t2, Address(d, 5 * unit));
962         __ stp(t7, t4, Address(d, 7 * unit));
963         __ str(t6, Address(__ pre(d, 8 * unit)));
964       }
965       // now we need to copy any remaining part block which may
966       // include a 4 word subblock and/or a 2 word subblock.
967       // bits 2 and 1 in the count tell us whether we
968       // have each such subblock
969       {
970         Label L1, L2;
971         __ tbz(count, exact_log2(4), L1);
972        // this is the same as above but copying only 4 longs hence
973        // with only one intervening stp between the str instructions
974        // but note that the offsets and registers still follow the
975        // same pattern
976         __ ldp(t0, t1, Address(s, 2 * unit));
977         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
978         if (direction == copy_forwards) {
979           __ str(t0, Address(d, 1 * unit));
980           __ stp(t1, t2, Address(d, 2 * unit));
981           __ str(t3, Address(__ pre(d, 4 * unit)));
982         } else {
983           __ str(t1, Address(d, 1 * unit));
984           __ stp(t3, t0, Address(d, 3 * unit));
985           __ str(t2, Address(__ pre(d, 4 * unit)));
986         }
987         __ bind(L1);
988 
989         __ tbz(count, 1, L2);
990        // this is the same as above but copying only 2 longs hence
991        // there is no intervening stp between the str instructions
992        // but note that the offset and register patterns are still
993        // the same
994         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
995         if (direction == copy_forwards) {
996           __ str(t0, Address(d, 1 * unit));
997           __ str(t1, Address(__ pre(d, 2 * unit)));
998         } else {
999           __ str(t1, Address(d, 1 * unit));
1000           __ str(t0, Address(__ pre(d, 2 * unit)));
1001         }
1002         __ bind(L2);
1003 
1004        // for forwards copy we need to re-adjust the offsets we
1005        // applied so that s and d follow the last words written
1006 
1007        if (direction == copy_forwards) {
1008          __ add(s, s, 16);
1009          __ add(d, d, 8);
1010        }
1011 
1012       }
1013 
1014       __ ret(lr);
1015     }
1016   }
1017 
1018   // Small copy: less than 16 bytes.
1019   //
1020   // NB: Ignores all of the bits of count which represent more than 15
1021   // bytes, so a caller doesn't have to mask them.
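  // For example, a byte copy (step == 1) with count == 11 copies 8 bytes
  // (bit 3 of count), skips the 4-byte step (bit 2 clear), then copies
  // 2 bytes and 1 byte for bits 1 and 0.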
1022 
1023   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1024     bool is_backwards = step < 0;
1025     size_t granularity = uabs(step);
1026     int direction = is_backwards ? -1 : 1;
1027     int unit = wordSize * direction;
1028 
1029     Label Lword, Lint, Lshort, Lbyte;
1030 
1031     assert(granularity
1032            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1033 
1034     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1035 
1036     // ??? I don't know if this bit-test-and-branch is the right thing
1037     // to do.  It does a lot of jumping, resulting in several
1038     // mispredicted branches.  It might make more sense to do this
1039     // with something like Duff's device with a single computed branch.
1040 
1041     __ tbz(count, 3 - exact_log2(granularity), Lword);
1042     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1043     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1044     __ bind(Lword);
1045 
1046     if (granularity <= sizeof (jint)) {
1047       __ tbz(count, 2 - exact_log2(granularity), Lint);
1048       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1049       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1050       __ bind(Lint);
1051     }
1052 
1053     if (granularity <= sizeof (jshort)) {
1054       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1055       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1056       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1057       __ bind(Lshort);
1058     }
1059 
1060     if (granularity <= sizeof (jbyte)) {
1061       __ tbz(count, 0, Lbyte);
1062       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1063       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1064       __ bind(Lbyte);
1065     }
1066   }
1067 
1068   Label copy_f, copy_b;
1069 
1070   // All-singing all-dancing memory copy.
1071   //
1072   // Copy count units of memory from s to d.  The size of a unit is
1073   // step, which can be positive or negative depending on the direction
1074   // of copy.  If is_aligned is false, we align the source address.
1075   //
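  // For example, the conjoint element copies below pass step == -size
  // (e.g. -2 for jshort), so the unit is one element and the copy runs
  // backwards; the disjoint copies pass +size and run forwards.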
1076 
1077   void copy_memory(bool is_aligned, Register s, Register d,
1078                    Register count, Register tmp, int step) {
1079     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1080     bool is_backwards = step < 0;
1081     int granularity = uabs(step);
1082     const Register t0 = r3, t1 = r4;
1083 
1084     // <= 96 bytes do inline. Direction doesn't matter because we always
1085     // load all the data before writing anything
1086     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1087     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1088     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1089     const Register send = r17, dend = r18;
1090 
1091     if (PrefetchCopyIntervalInBytes > 0)
1092       __ prfm(Address(s, 0), PLDL1KEEP);
1093     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1094     __ br(Assembler::HI, copy_big);
1095 
1096     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1097     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1098 
1099     __ cmp(count, u1(16/granularity));
1100     __ br(Assembler::LS, copy16);
1101 
1102     __ cmp(count, u1(64/granularity));
1103     __ br(Assembler::HI, copy80);
1104 
1105     __ cmp(count, u1(32/granularity));
1106     __ br(Assembler::LS, copy32);
1107 
1108     // 33..64 bytes
1109     if (UseSIMDForMemoryOps) {
1110       __ ldpq(v0, v1, Address(s, 0));
1111       __ ldpq(v2, v3, Address(send, -32));
1112       __ stpq(v0, v1, Address(d, 0));
1113       __ stpq(v2, v3, Address(dend, -32));
1114     } else {
1115       __ ldp(t0, t1, Address(s, 0));
1116       __ ldp(t2, t3, Address(s, 16));
1117       __ ldp(t4, t5, Address(send, -32));
1118       __ ldp(t6, t7, Address(send, -16));
1119 
1120       __ stp(t0, t1, Address(d, 0));
1121       __ stp(t2, t3, Address(d, 16));
1122       __ stp(t4, t5, Address(dend, -32));
1123       __ stp(t6, t7, Address(dend, -16));
1124     }
1125     __ b(finish);
1126 
1127     // 17..32 bytes
1128     __ bind(copy32);
1129     __ ldp(t0, t1, Address(s, 0));
1130     __ ldp(t2, t3, Address(send, -16));
1131     __ stp(t0, t1, Address(d, 0));
1132     __ stp(t2, t3, Address(dend, -16));
1133     __ b(finish);
1134 
1135     // 65..80/96 bytes
1136     // (96 bytes if SIMD because we do 32 bytes per instruction)
1137     __ bind(copy80);
1138     if (UseSIMDForMemoryOps) {
1139       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1140       __ ldpq(v4, v5, Address(send, -32));
1141       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1142       __ stpq(v4, v5, Address(dend, -32));
1143     } else {
1144       __ ldp(t0, t1, Address(s, 0));
1145       __ ldp(t2, t3, Address(s, 16));
1146       __ ldp(t4, t5, Address(s, 32));
1147       __ ldp(t6, t7, Address(s, 48));
1148       __ ldp(t8, t9, Address(send, -16));
1149 
1150       __ stp(t0, t1, Address(d, 0));
1151       __ stp(t2, t3, Address(d, 16));
1152       __ stp(t4, t5, Address(d, 32));
1153       __ stp(t6, t7, Address(d, 48));
1154       __ stp(t8, t9, Address(dend, -16));
1155     }
1156     __ b(finish);
1157 
1158     // 0..16 bytes
1159     __ bind(copy16);
1160     __ cmp(count, u1(8/granularity));
1161     __ br(Assembler::LO, copy8);
1162 
1163     // 8..16 bytes
1164     __ ldr(t0, Address(s, 0));
1165     __ ldr(t1, Address(send, -8));
1166     __ str(t0, Address(d, 0));
1167     __ str(t1, Address(dend, -8));
1168     __ b(finish);
1169 
1170     if (granularity < 8) {
1171       // 4..7 bytes
1172       __ bind(copy8);
1173       __ tbz(count, 2 - exact_log2(granularity), copy4);
1174       __ ldrw(t0, Address(s, 0));
1175       __ ldrw(t1, Address(send, -4));
1176       __ strw(t0, Address(d, 0));
1177       __ strw(t1, Address(dend, -4));
1178       __ b(finish);
1179       if (granularity < 4) {
1180         // 0..3 bytes
1181         __ bind(copy4);
1182         __ cbz(count, finish); // get rid of 0 case
1183         if (granularity == 2) {
1184           __ ldrh(t0, Address(s, 0));
1185           __ strh(t0, Address(d, 0));
1186         } else { // granularity == 1
1187           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1188           // the first and last byte.
1189           // Handle the 3 byte case by loading and storing base + count/2
1190           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1191           // This does mean that in the 1 byte case we load/store the same
1192           // byte 3 times.
1193           __ lsr(count, count, 1);
1194           __ ldrb(t0, Address(s, 0));
1195           __ ldrb(t1, Address(send, -1));
1196           __ ldrb(t2, Address(s, count));
1197           __ strb(t0, Address(d, 0));
1198           __ strb(t1, Address(dend, -1));
1199           __ strb(t2, Address(d, count));
1200         }
1201         __ b(finish);
1202       }
1203     }
1204 
1205     __ bind(copy_big);
1206     if (is_backwards) {
1207       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1208       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1209     }
1210 
1211     // Now that we've got the small case out of the way we can align the
1212     // source address on a 2-word boundary.
1213 
1214     Label aligned;
1215 
1216     if (is_aligned) {
1217       // We may have to adjust by 1 word to get s 2-word-aligned.
1218       __ tbz(s, exact_log2(wordSize), aligned);
1219       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1220       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1221       __ sub(count, count, wordSize/granularity);
1222     } else {
1223       if (is_backwards) {
1224         __ andr(rscratch2, s, 2 * wordSize - 1);
1225       } else {
1226         __ neg(rscratch2, s);
1227         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1228       }
1229       // rscratch2 is the byte adjustment needed to align s.
1230       __ cbz(rscratch2, aligned);
1231       int shift = exact_log2(granularity);
1232       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1233       __ sub(count, count, rscratch2);
1234 
1235 #if 0
1236       // ?? This code is only correct for a disjoint copy.  It may or
1237       // may not make sense to use it in that case.
1238 
1239       // Copy the first pair; s and d may not be aligned.
1240       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1241       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1242 
1243       // Align s and d, adjust count
1244       if (is_backwards) {
1245         __ sub(s, s, rscratch2);
1246         __ sub(d, d, rscratch2);
1247       } else {
1248         __ add(s, s, rscratch2);
1249         __ add(d, d, rscratch2);
1250       }
1251 #else
1252       copy_memory_small(s, d, rscratch2, rscratch1, step);
1253 #endif
1254     }
1255 
1256     __ bind(aligned);
1257 
1258     // s is now 2-word-aligned.
1259 
1260     // We have a count of units and some trailing bytes.  Adjust the
1261     // count and do a bulk copy of words.
1262     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1263     if (direction == copy_forwards)
1264       __ bl(copy_f);
1265     else
1266       __ bl(copy_b);
1267 
1268     // And the tail.
1269     copy_memory_small(s, d, count, tmp, step);
1270 
1271     if (granularity >= 8) __ bind(copy8);
1272     if (granularity >= 4) __ bind(copy4);
1273     __ bind(finish);
1274   }
1275 
1276 
1277   void clobber_registers() {
1278 #ifdef ASSERT
1279     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1280     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1281     for (Register r = r3; r <= r18; r++)
1282       if (r != rscratch1) __ mov(r, rscratch1);
1283 #endif
1284   }
1285 
1286   // Scan over array at a for count oops, verifying each one.
1287   // Preserves a and count, clobbers rscratch1 and rscratch2.
1288   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1289     Label loop, end;
1290     __ mov(rscratch1, a);
1291     __ mov(rscratch2, zr);
1292     __ bind(loop);
1293     __ cmp(rscratch2, count);
1294     __ br(Assembler::HS, end);
1295     if (size == (size_t)wordSize) {
1296       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1297       __ verify_oop(temp);
1298     } else {
1299       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1300       __ decode_heap_oop(temp); // calls verify_oop
1301     }
1302     __ add(rscratch2, rscratch2, size);
1303     __ b(loop);
1304     __ bind(end);
1305   }
1306 
1307   // Arguments:
1308   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1309   //             ignored
1310   //   is_oop  - true => oop array, so generate store check code
1311   //   name    - stub name string
1312   //
1313   // Inputs:
1314   //   c_rarg0   - source array address
1315   //   c_rarg1   - destination array address
1316   //   c_rarg2   - element count, treated as ssize_t, can be zero
1317   //
1318   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1319   // the hardware handle it.  The two dwords within qwords that span
1320   // cache line boundaries will still be loaded and stored atomically.
1321   //
1322   // Side Effects:
1323   //   disjoint_int_copy_entry is set to the no-overlap entry point
1324   //   used by generate_conjoint_int_oop_copy().
1325   //
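  // The element-width wrappers further down (generate_disjoint_byte_copy,
  // generate_disjoint_short_copy, ...) simply call this with the matching
  // element size and is_oop == false.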
1326   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1327                                   const char *name, bool dest_uninitialized = false) {
1328     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1329     RegSet saved_reg = RegSet::of(s, d, count);
1330     __ align(CodeEntryAlignment);
1331     StubCodeMark mark(this, "StubRoutines", name);
1332     address start = __ pc();
1333     __ enter();
1334 
1335     if (entry != NULL) {
1336       *entry = __ pc();
1337       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1338       BLOCK_COMMENT("Entry:");
1339     }
1340 
1341     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1342     if (dest_uninitialized) {
1343       decorators |= IS_DEST_UNINITIALIZED;
1344     }
1345     if (aligned) {
1346       decorators |= ARRAYCOPY_ALIGNED;
1347     }
1348 
1349     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1350     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1351 
1352     if (is_oop) {
1353       // save regs before copy_memory
1354       __ push(RegSet::of(d, count), sp);
1355     }
1356     {
1357       // UnsafeCopyMemory page error: continue after ucm
1358       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1359       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1360       copy_memory(aligned, s, d, count, rscratch1, size);
1361     }
1362 
1363     if (is_oop) {
1364       __ pop(RegSet::of(d, count), sp);
1365       if (VerifyOops)
1366         verify_oop_array(size, d, count, r16);
1367     }
1368 
1369     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1370 
1371     __ leave();
1372     __ mov(r0, zr); // return 0
1373     __ ret(lr);
1374     return start;
1375   }
1376 
1377   // Arguments:
1378   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1379   //             ignored
1380   //   is_oop  - true => oop array, so generate store check code
1381   //   name    - stub name string
1382   //
1383   // Inputs:
1384   //   c_rarg0   - source array address
1385   //   c_rarg1   - destination array address
1386   //   c_rarg2   - element count, treated as ssize_t, can be zero
1387   //
1388   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1389   // the hardware handle it.  The two dwords within qwords that span
1390   // cache line boundaries will still be loaded and stored atomically.
1391   //
1392   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1393                                  address *entry, const char *name,
1394                                  bool dest_uninitialized = false) {
1395     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1396     RegSet saved_regs = RegSet::of(s, d, count);
1397     StubCodeMark mark(this, "StubRoutines", name);
1398     address start = __ pc();
1399     __ enter();
1400 
1401     if (entry != NULL) {
1402       *entry = __ pc();
1403       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1404       BLOCK_COMMENT("Entry:");
1405     }
1406 
1407     // use fwd copy when (d-s) above_equal (count*size)
1408     __ sub(rscratch1, d, s);
1409     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1410     __ br(Assembler::HS, nooverlap_target);
1411 
1412     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1413     if (dest_uninitialized) {
1414       decorators |= IS_DEST_UNINITIALIZED;
1415     }
1416     if (aligned) {
1417       decorators |= ARRAYCOPY_ALIGNED;
1418     }
1419 
1420     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1421     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1422 
1423     if (is_oop) {
1424       // save regs before copy_memory
1425       __ push(RegSet::of(d, count), sp);
1426     }
1427     {
1428       // UnsafeCopyMemory page error: continue after ucm
1429       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1430       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1431       copy_memory(aligned, s, d, count, rscratch1, -size);
1432     }
1433     if (is_oop) {
1434       __ pop(RegSet::of(d, count), sp);
1435       if (VerifyOops)
1436         verify_oop_array(size, d, count, r16);
1437     }
1438     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1439     __ leave();
1440     __ mov(r0, zr); // return 0
1441     __ ret(lr);
1442     return start;
1443   }
1444 
1445   // Arguments:
1446   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1447   //             ignored
1448   //   name    - stub name string
1449   //
1450   // Inputs:
1451   //   c_rarg0   - source array address
1452   //   c_rarg1   - destination array address
1453   //   c_rarg2   - element count, treated as ssize_t, can be zero
1454   //
1455   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1456   // we let the hardware handle it.  The one to eight bytes within words,
1457   // dwords or qwords that span cache line boundaries will still be loaded
1458   // and stored atomically.
1459   //
1467   // Side Effects:
1468   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1469   //   used by generate_conjoint_byte_copy().
1470   //
1471   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1472     const bool not_oop = false;
1473     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1474   }
1475 
1476   // Arguments:
1477   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1478   //             ignored
1479   //   name    - stub name string
1480   //
1481   // Inputs:
1482   //   c_rarg0   - source array address
1483   //   c_rarg1   - destination array address
1484   //   c_rarg2   - element count, treated as ssize_t, can be zero
1485   //
1486   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1487   // we let the hardware handle it.  The one to eight bytes within words,
1488   // dwords or qwords that span cache line boundaries will still be loaded
1489   // and stored atomically.
1490   //
1491   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1492                                       address* entry, const char *name) {
1493     const bool not_oop = false;
1494     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1495   }
1496 
1497   // Arguments:
1498   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1499   //             ignored
1500   //   name    - stub name string
1501   //
1502   // Inputs:
1503   //   c_rarg0   - source array address
1504   //   c_rarg1   - destination array address
1505   //   c_rarg2   - element count, treated as ssize_t, can be zero
1506   //
1507   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1508   // let the hardware handle it.  The two or four words within dwords
1509   // or qwords that span cache line boundaries will still be loaded
1510   // and stored atomically.
1511   //
1512   // Side Effects:
1513   //   disjoint_short_copy_entry is set to the no-overlap entry point
1514   //   used by generate_conjoint_short_copy().
1515   //
1516   address generate_disjoint_short_copy(bool aligned,
1517                                        address* entry, const char *name) {
1518     const bool not_oop = false;
1519     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1520   }
1521 
1522   // Arguments:
1523   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1524   //             ignored
1525   //   name    - stub name string
1526   //
1527   // Inputs:
1528   //   c_rarg0   - source array address
1529   //   c_rarg1   - destination array address
1530   //   c_rarg2   - element count, treated as ssize_t, can be zero
1531   //
1532   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1533   // let the hardware handle it.  The two or four words within dwords
1534   // or qwords that span cache line boundaries will still be loaded
1535   // and stored atomically.
1536   //
1537   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1538                                        address *entry, const char *name) {
1539     const bool not_oop = false;
1540     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1541   }
1542 
1543   // Arguments:
1544   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1545   //             ignored
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554   // the hardware handle it.  The two dwords within qwords that span
1555   // cache line boundaries will still be loaded and stored atomically.
1556   //
1557   // Side Effects:
1558   //   disjoint_int_copy_entry is set to the no-overlap entry point
1559   //   used by generate_conjoint_int_copy().
1560   //
1561   address generate_disjoint_int_copy(bool aligned, address *entry,
1562                                          const char *name, bool dest_uninitialized = false) {
1563     const bool not_oop = false;
1564     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578   // the hardware handle it.  The two dwords within qwords that span
1579   // cache line boundaries will still be loaded and stored atomically.
1580   //
1581   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1582                                      address *entry, const char *name,
1583                                      bool dest_uninitialized = false) {
1584     const bool not_oop = false;
1585     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1586   }
1587 
1588 
1589   // Arguments:
1590   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1591   //             ignored
1592   //   name    - stub name string
1593   //
1594   // Inputs:
1595   //   c_rarg0   - source array address
1596   //   c_rarg1   - destination array address
1597   //   c_rarg2   - element count, treated as size_t, can be zero
1598   //
1599   // Side Effects:
1600   //   disjoint_long_copy_entry is set to the
1601   //   no-overlap entry point used by generate_conjoint_long_copy().
1602   //
1603   address generate_disjoint_long_copy(bool aligned, address *entry,
1604                                           const char *name, bool dest_uninitialized = false) {
1605     const bool not_oop = false;
1606     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as size_t, can be zero
1618   //
1619   address generate_conjoint_long_copy(bool aligned,
1620                                       address nooverlap_target, address *entry,
1621                                       const char *name, bool dest_uninitialized = false) {
1622     const bool not_oop = false;
1623     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1624   }
1625 
1626   // Arguments:
1627   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1628   //             ignored
1629   //   name    - stub name string
1630   //
1631   // Inputs:
1632   //   c_rarg0   - source array address
1633   //   c_rarg1   - destination array address
1634   //   c_rarg2   - element count, treated as size_t, can be zero
1635   //
1636   // Side Effects:
1637   //   disjoint_oop_copy_entry is set to the
1638   //   no-overlap entry point used by generate_conjoint_oop_copy().
1639   //
1640   address generate_disjoint_oop_copy(bool aligned, address *entry,
1641                                      const char *name, bool dest_uninitialized) {
1642     const bool is_oop = true;
1643     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1644     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1645   }
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as size_t, can be zero
1656   //
1657   address generate_conjoint_oop_copy(bool aligned,
1658                                      address nooverlap_target, address *entry,
1659                                      const char *name, bool dest_uninitialized) {
1660     const bool is_oop = true;
1661     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1663                                   name, dest_uninitialized);
1664   }
1665 
1666 
1667   // Helper for generating a dynamic type check.
1668   // Smashes rscratch1, rscratch2.
1669   void generate_type_check(Register sub_klass,
1670                            Register super_check_offset,
1671                            Register super_klass,
1672                            Label& L_success) {
1673     assert_different_registers(sub_klass, super_check_offset, super_klass);
1674 
1675     BLOCK_COMMENT("type_check:");
1676 
1677     Label L_miss;
1678 
1679     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1680                                      super_check_offset);
1681     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1682 
1683     // Fall through on failure!
1684     __ BIND(L_miss);
1685   }
1686 
1687   //
1688   //  Generate checkcasting array copy stub
1689   //
1690   //  Input:
1691   //    c_rarg0   - source array address
1692   //    c_rarg1   - destination array address
1693   //    c_rarg2   - element count, treated as ssize_t, can be zero
1694   //    c_rarg3   - size_t ckoff (super_check_offset)
1695   //    c_rarg4   - oop ckval (super_klass)
1696   //
1697   //  Output:
1698   //    r0 ==  0  -  success
1699   //    r0 == -1^K - failure, where K is partial transfer count
1700   //
1701   address generate_checkcast_copy(const char *name, address *entry,
1702                                   bool dest_uninitialized = false) {
1703 
1704     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1705 
1706     // Input registers (after setup_arg_regs)
1707     const Register from        = c_rarg0;   // source array address
1708     const Register to          = c_rarg1;   // destination array address
1709     const Register count       = c_rarg2;   // elements count
1710     const Register ckoff       = c_rarg3;   // super_check_offset
1711     const Register ckval       = c_rarg4;   // super_klass
1712 
1713     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1714     RegSet wb_post_saved_regs = RegSet::of(count);
1715 
1716     // Registers used as temps (r18, r19, r20 and r21 are save-on-entry)
1717     const Register count_save  = r21;       // orig elements count
1718     const Register start_to    = r20;       // destination array start address
1719     const Register copied_oop  = r18;       // actual oop copied
1720     const Register r19_klass   = r19;       // oop._klass
1721 
1722     //---------------------------------------------------------------
1723     // Assembler stub will be used for this call to arraycopy
1724     // if the two arrays are subtypes of Object[] but the
1725     // destination array type is not equal to or a supertype
1726     // of the source type.  Each element must be separately
1727     // checked.
1728 
1729     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1730                                copied_oop, r19_klass, count_save);
1731 
1732     __ align(CodeEntryAlignment);
1733     StubCodeMark mark(this, "StubRoutines", name);
1734     address start = __ pc();
1735 
1736     __ enter(); // required for proper stackwalking of RuntimeStub frame
1737 
1738 #ifdef ASSERT
1739     // caller guarantees that the arrays really are different
1740     // otherwise, we would have to make conjoint checks
1741     { Label L;
1742       array_overlap_test(L, TIMES_OOP);
1743       __ stop("checkcast_copy within a single array");
1744       __ bind(L);
1745     }
1746 #endif //ASSERT
1747 
1748     // Caller of this entry point must set up the argument registers.
1749     if (entry != NULL) {
1750       *entry = __ pc();
1751       BLOCK_COMMENT("Entry:");
1752     }
1753 
1754     // Empty array:  Nothing to do.
1755     __ cbz(count, L_done);
1756 
1757     __ push(RegSet::of(r18, r19, r20, r21), sp);
1758 
1759 #ifdef ASSERT
1760     BLOCK_COMMENT("assert consistent ckoff/ckval");
1761     // The ckoff and ckval must be mutually consistent,
1762     // even though caller generates both.
1763     { Label L;
1764       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1765       __ ldrw(start_to, Address(ckval, sco_offset));
1766       __ cmpw(ckoff, start_to);
1767       __ br(Assembler::EQ, L);
1768       __ stop("super_check_offset inconsistent");
1769       __ bind(L);
1770     }
1771 #endif //ASSERT
1772 
1773     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1774     bool is_oop = true;
1775     if (dest_uninitialized) {
1776       decorators |= IS_DEST_UNINITIALIZED;
1777     }
1778 
1779     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1780     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1781 
1782     // save the original count
1783     __ mov(count_save, count);
1784 
1785     // Copy from low to high addresses
1786     __ mov(start_to, to);              // Save destination array start address
1787     __ b(L_load_element);
1788 
1789     // ======== begin loop ========
1790     // (Loop is rotated; its entry is L_load_element.)
1791     // Loop control:
1792     //   for (; count != 0; count--) {
1793     //     copied_oop = load_heap_oop(from++);
1794     //     ... generate_type_check ...;
1795     //     store_heap_oop(to++, copied_oop);
1796     //   }
1797     __ align(OptoLoopAlignment);
1798 
1799     __ BIND(L_store_element);
1800     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1801     __ sub(count, count, 1);
1802     __ cbz(count, L_do_card_marks);
1803 
1804     // ======== loop entry is here ========
1805     __ BIND(L_load_element);
1806     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1807     __ cbz(copied_oop, L_store_element);
1808 
1809     __ load_klass(r19_klass, copied_oop);// query the object klass
1810     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1811     // ======== end loop ========
1812 
1813     // It was a real error; we must depend on the caller to finish the job.
1814     // Register count = remaining oops, count_orig = total oops.
1815     // Emit GC store barriers for the oops we have copied and report
1816     // their number to the caller.
1817 
1818     __ subs(count, count_save, count);     // K = partially copied oop count
1819     __ eon(count, count, zr);                   // report (-1^K) to caller
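    // (Note, illustrative) eon with zr is a bitwise complement, so count now
    // holds ~K == -1 ^ K, the value reported in r0; a caller can recover the
    // partial transfer count as ~r0.  The branch below still tests the flags
    // from the subs above: if K == 0 no oops were copied and the card marks
    // can be skipped.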
1820     __ br(Assembler::EQ, L_done_pop);
1821 
1822     __ BIND(L_do_card_marks);
1823     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1824 
1825     __ bind(L_done_pop);
1826     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1827     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1828 
1829     __ bind(L_done);
1830     __ mov(r0, count);
1831     __ leave();
1832     __ ret(lr);
1833 
1834     return start;
1835   }
1836 
1837   // Perform range checks on the proposed arraycopy.
1838   // Kills temp, but nothing else.
1839   // Also, clean the sign bits of src_pos and dst_pos.
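  // Roughly equivalent C (illustrative only), using unsigned 32-bit compares:
  //
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length()) goto L_failed;
  //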
1840   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1841                               Register src_pos, // source position (c_rarg1)
1842                               Register dst,     // destination array oop (c_rarg2)
1843                               Register dst_pos, // destination position (c_rarg3)
1844                               Register length,
1845                               Register temp,
1846                               Label& L_failed) {
1847     BLOCK_COMMENT("arraycopy_range_checks:");
1848 
1849     assert_different_registers(rscratch1, temp);
1850 
1851     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1852     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1853     __ addw(temp, length, src_pos);
1854     __ cmpw(temp, rscratch1);
1855     __ br(Assembler::HI, L_failed);
1856 
1857     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1858     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1859     __ addw(temp, length, dst_pos);
1860     __ cmpw(temp, rscratch1);
1861     __ br(Assembler::HI, L_failed);
1862 
1863     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1864     __ movw(src_pos, src_pos);
1865     __ movw(dst_pos, dst_pos);
1866 
1867     BLOCK_COMMENT("arraycopy_range_checks done");
1868   }
1869 
1870   // These stubs get called from some dumb test routine.
1871   // I'll write them properly when they're called from
1872   // something that's actually doing something.
1873   static void fake_arraycopy_stub(address src, address dst, int count) {
1874     assert(count == 0, "huh?");
1875   }
1876 
1877 
1878   //
1879   //  Generate 'unsafe' array copy stub
1880   //  Though just as safe as the other stubs, it takes an unscaled
1881   //  size_t argument instead of an element count.
1882   //
1883   //  Input:
1884   //    c_rarg0   - source array address
1885   //    c_rarg1   - destination array address
1886   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1887   //
1888   // Examines the alignment of the operands and dispatches
1889   // to a long, int, short, or byte copy loop.
1890   //
1891   address generate_unsafe_copy(const char *name,
1892                                address byte_copy_entry,
1893                                address short_copy_entry,
1894                                address int_copy_entry,
1895                                address long_copy_entry) {
1896     Label L_long_aligned, L_int_aligned, L_short_aligned;
1897     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1898 
1899     __ align(CodeEntryAlignment);
1900     StubCodeMark mark(this, "StubRoutines", name);
1901     address start = __ pc();
1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
1903 
1904     // bump this on entry, not on exit:
1905     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1906 
1907     __ orr(rscratch1, s, d);
1908     __ orr(rscratch1, rscratch1, count);
1909 
1910     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1911     __ cbz(rscratch1, L_long_aligned);
1912     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1913     __ cbz(rscratch1, L_int_aligned);
1914     __ tbz(rscratch1, 0, L_short_aligned);
1915     __ b(RuntimeAddress(byte_copy_entry));
1916 
1917     __ BIND(L_short_aligned);
1918     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1919     __ b(RuntimeAddress(short_copy_entry));
1920     __ BIND(L_int_aligned);
1921     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1922     __ b(RuntimeAddress(int_copy_entry));
1923     __ BIND(L_long_aligned);
1924     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1925     __ b(RuntimeAddress(long_copy_entry));
1926 
1927     return start;
1928   }
1929 
1930   //
1931   //  Generate generic array copy stubs
1932   //
1933   //  Input:
1934   //    c_rarg0    -  src oop
1935   //    c_rarg1    -  src_pos (32-bits)
1936   //    c_rarg2    -  dst oop
1937   //    c_rarg3    -  dst_pos (32-bits)
1938   //    c_rarg4    -  element count (32-bits)
1939   //
1940   //  Output:
1941   //    r0 ==  0  -  success
1942   //    r0 == -1^K - failure, where K is partial transfer count
1943   //
1944   address generate_generic_copy(const char *name,
1945                                 address byte_copy_entry, address short_copy_entry,
1946                                 address int_copy_entry, address oop_copy_entry,
1947                                 address long_copy_entry, address checkcast_copy_entry) {
1948 
1949     Label L_failed, L_objArray;
1950     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1951 
1952     // Input registers
1953     const Register src        = c_rarg0;  // source array oop
1954     const Register src_pos    = c_rarg1;  // source position
1955     const Register dst        = c_rarg2;  // destination array oop
1956     const Register dst_pos    = c_rarg3;  // destination position
1957     const Register length     = c_rarg4;
1958 
1959 
1960     // Registers used as temps
1961     const Register dst_klass  = c_rarg5;
1962 
1963     __ align(CodeEntryAlignment);
1964 
1965     StubCodeMark mark(this, "StubRoutines", name);
1966 
1967     address start = __ pc();
1968 
1969     __ enter(); // required for proper stackwalking of RuntimeStub frame
1970 
1971     // bump this on entry, not on exit:
1972     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1973 
1974     //-----------------------------------------------------------------------
1975     // Assembler stub will be used for this call to arraycopy
1976     // if the following conditions are met:
1977     //
1978     // (1) src and dst must not be null.
1979     // (2) src_pos must not be negative.
1980     // (3) dst_pos must not be negative.
1981     // (4) length  must not be negative.
1982     // (5) src klass and dst klass should be the same and not NULL.
1983     // (6) src and dst should be arrays.
1984     // (7) src_pos + length must not exceed length of src.
1985     // (8) dst_pos + length must not exceed length of dst.
1986     //
1987 
1988     //  if (src == NULL) return -1;
1989     __ cbz(src, L_failed);
1990 
1991     //  if (src_pos < 0) return -1;
1992     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1993 
1994     //  if (dst == NULL) return -1;
1995     __ cbz(dst, L_failed);
1996 
1997     //  if (dst_pos < 0) return -1;
1998     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1999 
2000     // registers used as temp
2001     const Register scratch_length    = r16; // elements count to copy
2002     const Register scratch_src_klass = r17; // array klass
2003     const Register lh                = r18; // layout helper
2004 
2005     //  if (length < 0) return -1;
2006     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2007     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2008 
2009     __ load_klass(scratch_src_klass, src);
2010 #ifdef ASSERT
2011     //  assert(src->klass() != NULL);
2012     {
2013       BLOCK_COMMENT("assert klasses not null {");
2014       Label L1, L2;
2015       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2016       __ bind(L1);
2017       __ stop("broken null klass");
2018       __ bind(L2);
2019       __ load_klass(rscratch1, dst);
2020       __ cbz(rscratch1, L1);     // this would be broken also
2021       BLOCK_COMMENT("} assert klasses not null done");
2022     }
2023 #endif
2024 
2025     // Load layout helper (32-bits)
2026     //
2027     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2028     // 32        30    24            16              8     2                 0
2029     //
2030     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2031     //
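    // For example (illustrative), a typical int[] layout helper has
    //   array_tag == 0x3 (typeArray),
    //   header_size == arrayOopDesc::base_offset_in_bytes(T_INT), and
    //   log2_element_size == 2;
    // the ubfx below extracts the header size, and the low bits supply the
    // element-size shift used to pick a copy loop.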
2032 
2033     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2034 
2035     // Handle objArrays completely differently...
2036     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2037     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2038     __ movw(rscratch1, objArray_lh);
2039     __ eorw(rscratch2, lh, rscratch1);
2040     __ cbzw(rscratch2, L_objArray);
2041 
2042     //  if (src->klass() != dst->klass()) return -1;
2043     __ load_klass(rscratch2, dst);
2044     __ eor(rscratch2, rscratch2, scratch_src_klass);
2045     __ cbnz(rscratch2, L_failed);
2046 
2047     //  if (!src->is_Array()) return -1;
2048     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2049 
2050     // At this point, it is known to be a typeArray (array_tag 0x3).
2051 #ifdef ASSERT
2052     {
2053       BLOCK_COMMENT("assert primitive array {");
2054       Label L;
2055       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2056       __ cmpw(lh, rscratch2);
2057       __ br(Assembler::GE, L);
2058       __ stop("must be a primitive array");
2059       __ bind(L);
2060       BLOCK_COMMENT("} assert primitive array done");
2061     }
2062 #endif
2063 
2064     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2065                            rscratch2, L_failed);
2066 
2067     // TypeArrayKlass
2068     //
2069     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2070     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2071     //
2072 
2073     const Register rscratch1_offset = rscratch1;    // array offset
2074     const Register r18_elsize = lh; // element size
2075 
2076     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2077            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2078     __ add(src, src, rscratch1_offset);           // src array offset
2079     __ add(dst, dst, rscratch1_offset);           // dst array offset
2080     BLOCK_COMMENT("choose copy loop based on element size");
2081 
2082     // next registers should be set before the jump to corresponding stub
2083     const Register from     = c_rarg0;  // source array address
2084     const Register to       = c_rarg1;  // destination array address
2085     const Register count    = c_rarg2;  // elements count
2086 
2087     // 'from', 'to' and 'count' must be set in this order, since they alias
2088     // 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2), which are still live here.
2089 
2090     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2091 
2092     // The possible values of elsize are 0-3, i.e. exact_log2(element
2093     // size in bytes).  We do a simple bitwise binary search.
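    // Decision tree (illustrative):
    //   bit 1 of elsize clear -> bytes/shorts: bit 0 clear -> byte copy,  set -> short copy
    //   bit 1 of elsize set   -> ints/longs:   bit 0 clear -> int copy,   set -> long copy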
2094   __ BIND(L_copy_bytes);
2095     __ tbnz(r18_elsize, 1, L_copy_ints);
2096     __ tbnz(r18_elsize, 0, L_copy_shorts);
2097     __ lea(from, Address(src, src_pos));// src_addr
2098     __ lea(to,   Address(dst, dst_pos));// dst_addr
2099     __ movw(count, scratch_length); // length
2100     __ b(RuntimeAddress(byte_copy_entry));
2101 
2102   __ BIND(L_copy_shorts);
2103     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2104     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2105     __ movw(count, scratch_length); // length
2106     __ b(RuntimeAddress(short_copy_entry));
2107 
2108   __ BIND(L_copy_ints);
2109     __ tbnz(r18_elsize, 0, L_copy_longs);
2110     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2111     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2112     __ movw(count, scratch_length); // length
2113     __ b(RuntimeAddress(int_copy_entry));
2114 
2115   __ BIND(L_copy_longs);
2116 #ifdef ASSERT
2117     {
2118       BLOCK_COMMENT("assert long copy {");
2119       Label L;
2120       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2121       __ cmpw(r18_elsize, LogBytesPerLong);
2122       __ br(Assembler::EQ, L);
2123       __ stop("must be long copy, but elsize is wrong");
2124       __ bind(L);
2125       BLOCK_COMMENT("} assert long copy done");
2126     }
2127 #endif
2128     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2129     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2130     __ movw(count, scratch_length); // length
2131     __ b(RuntimeAddress(long_copy_entry));
2132 
2133     // ObjArrayKlass
2134   __ BIND(L_objArray);
2135     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2136 
2137     Label L_plain_copy, L_checkcast_copy;
2138     //  test array classes for subtyping
2139     __ load_klass(r18, dst);
2140     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2141     __ br(Assembler::NE, L_checkcast_copy);
2142 
2143     // Identically typed arrays can be copied without element-wise checks.
2144     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2145                            rscratch2, L_failed);
2146 
2147     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2148     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2149     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2150     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2151     __ movw(count, scratch_length); // length
2152   __ BIND(L_plain_copy);
2153     __ b(RuntimeAddress(oop_copy_entry));
2154 
2155   __ BIND(L_checkcast_copy);
2156     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2157     {
2158       // Before looking at dst.length, make sure dst is also an objArray.
2159       __ ldrw(rscratch1, Address(r18, lh_offset));
2160       __ movw(rscratch2, objArray_lh);
2161       __ eorw(rscratch1, rscratch1, rscratch2);
2162       __ cbnzw(rscratch1, L_failed);
2163 
2164       // It is safe to examine both src.length and dst.length.
2165       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2166                              r18, L_failed);
2167 
2168       __ load_klass(dst_klass, dst); // reload
2169 
2170       // Marshal the base address arguments now, freeing registers.
2171       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2172       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2173       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2174       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2175       __ movw(count, length);           // length (reloaded)
2176       Register sco_temp = c_rarg3;      // this register is free now
2177       assert_different_registers(from, to, count, sco_temp,
2178                                  dst_klass, scratch_src_klass);
2179       // assert_clean_int(count, sco_temp);
2180 
2181       // Generate the type check.
2182       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2183       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2184 
2185       // Smashes rscratch1, rscratch2
2186       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2187 
2188       // Fetch destination element klass from the ObjArrayKlass header.
2189       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2190       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2191       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2192 
2193       // the checkcast_copy loop needs two extra arguments:
2194       assert(c_rarg3 == sco_temp, "#3 already in place");
2195       // Set up arguments for checkcast_copy_entry.
2196       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2197       __ b(RuntimeAddress(checkcast_copy_entry));
2198     }
2199 
2200   __ BIND(L_failed);
2201     __ mov(r0, -1);
2202     __ leave();   // required for proper stackwalking of RuntimeStub frame
2203     __ ret(lr);
2204 
2205     return start;
2206   }
2207 
2208   //
2209   // Generate stub for array fill. If "aligned" is true, the
2210   // "to" address is assumed to be heapword aligned.
2211   //
2212   // Arguments for generated stub:
2213   //   to:    c_rarg0
2214   //   value: c_rarg1
2215   //   count: c_rarg2 treated as signed
2216   //
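  // Rough outline (illustrative): the fill value is first replicated to 32 bits
  // with bfi (e.g. a byte 0xAB becomes 0xABAB, then 0xABABABAB), short fills are
  // handled element-by-element, otherwise the destination is aligned to 8 bytes,
  // whole words are stored (via zero_words when the value is zero and
  // UseBlockZeroing is enabled), and any remaining tail is written with a final
  // overlapping 8-byte store.
  //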
2217   address generate_fill(BasicType t, bool aligned, const char *name) {
2218     __ align(CodeEntryAlignment);
2219     StubCodeMark mark(this, "StubRoutines", name);
2220     address start = __ pc();
2221 
2222     BLOCK_COMMENT("Entry:");
2223 
2224     const Register to        = c_rarg0;  // source array address
2225     const Register value     = c_rarg1;  // value
2226     const Register count     = c_rarg2;  // elements count
2227 
2228     const Register bz_base = r10;        // base for block_zero routine
2229     const Register cnt_words = r11;      // temp register
2230 
2231     __ enter();
2232 
2233     Label L_fill_elements, L_exit1;
2234 
2235     int shift = -1;
2236     switch (t) {
2237       case T_BYTE:
2238         shift = 0;
2239         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2240         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2241         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2242         __ br(Assembler::LO, L_fill_elements);
2243         break;
2244       case T_SHORT:
2245         shift = 1;
2246         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2247         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2248         __ br(Assembler::LO, L_fill_elements);
2249         break;
2250       case T_INT:
2251         shift = 2;
2252         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2253         __ br(Assembler::LO, L_fill_elements);
2254         break;
2255       default: ShouldNotReachHere();
2256     }
2257 
2258     // Align source address at 8 bytes address boundary.
2259     Label L_skip_align1, L_skip_align2, L_skip_align4;
2260     if (!aligned) {
2261       switch (t) {
2262         case T_BYTE:
2263           // One byte misalignment happens only for byte arrays.
2264           __ tbz(to, 0, L_skip_align1);
2265           __ strb(value, Address(__ post(to, 1)));
2266           __ subw(count, count, 1);
2267           __ bind(L_skip_align1);
2268           // Fallthrough
2269         case T_SHORT:
2270           // Two bytes misalignment happens only for byte and short (char) arrays.
2271           __ tbz(to, 1, L_skip_align2);
2272           __ strh(value, Address(__ post(to, 2)));
2273           __ subw(count, count, 2 >> shift);
2274           __ bind(L_skip_align2);
2275           // Fallthrough
2276         case T_INT:
2277           // Align to 8 bytes, we know we are 4 byte aligned to start.
2278           __ tbz(to, 2, L_skip_align4);
2279           __ strw(value, Address(__ post(to, 4)));
2280           __ subw(count, count, 4 >> shift);
2281           __ bind(L_skip_align4);
2282           break;
2283         default: ShouldNotReachHere();
2284       }
2285     }
2286 
2287     //
2288     //  Fill large chunks
2289     //
2290     __ lsrw(cnt_words, count, 3 - shift); // number of words
2291     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2292     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2293     if (UseBlockZeroing) {
2294       Label non_block_zeroing, rest;
2295       // If the fill value is zero we can use the fast zero_words().
2296       __ cbnz(value, non_block_zeroing);
2297       __ mov(bz_base, to);
2298       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2299       __ zero_words(bz_base, cnt_words);
2300       __ b(rest);
2301       __ bind(non_block_zeroing);
2302       __ fill_words(to, cnt_words, value);
2303       __ bind(rest);
2304     } else {
2305       __ fill_words(to, cnt_words, value);
2306     }
2307 
2308     // Remaining count is less than 8 bytes. Fill it by a single store.
2309     // Note that the total length is no less than 8 bytes.
2310     if (t == T_BYTE || t == T_SHORT) {
2311       Label L_exit1;
2312       __ cbzw(count, L_exit1);
2313       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2314       __ str(value, Address(to, -8));    // overwrite some elements
2315       __ bind(L_exit1);
2316       __ leave();
2317       __ ret(lr);
2318     }
2319 
2320     // Handle fills of less than 8 bytes.
2321     Label L_fill_2, L_fill_4, L_exit2;
2322     __ bind(L_fill_elements);
2323     switch (t) {
2324       case T_BYTE:
2325         __ tbz(count, 0, L_fill_2);
2326         __ strb(value, Address(__ post(to, 1)));
2327         __ bind(L_fill_2);
2328         __ tbz(count, 1, L_fill_4);
2329         __ strh(value, Address(__ post(to, 2)));
2330         __ bind(L_fill_4);
2331         __ tbz(count, 2, L_exit2);
2332         __ strw(value, Address(to));
2333         break;
2334       case T_SHORT:
2335         __ tbz(count, 0, L_fill_4);
2336         __ strh(value, Address(__ post(to, 2)));
2337         __ bind(L_fill_4);
2338         __ tbz(count, 1, L_exit2);
2339         __ strw(value, Address(to));
2340         break;
2341       case T_INT:
2342         __ cbzw(count, L_exit2);
2343         __ strw(value, Address(to));
2344         break;
2345       default: ShouldNotReachHere();
2346     }
2347     __ bind(L_exit2);
2348     __ leave();
2349     __ ret(lr);
2350     return start;
2351   }
2352 
2353   void generate_arraycopy_stubs() {
2354     address entry;
2355     address entry_jbyte_arraycopy;
2356     address entry_jshort_arraycopy;
2357     address entry_jint_arraycopy;
2358     address entry_oop_arraycopy;
2359     address entry_jlong_arraycopy;
2360     address entry_checkcast_arraycopy;
2361 
2362     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2363     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2364 
2365     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2366 
2367     //*** jbyte
2368     // Always need aligned and unaligned versions
2369     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2370                                                                                   "jbyte_disjoint_arraycopy");
2371     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2372                                                                                   &entry_jbyte_arraycopy,
2373                                                                                   "jbyte_arraycopy");
2374     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2375                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2376     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2377                                                                                   "arrayof_jbyte_arraycopy");
2378 
2379     //*** jshort
2380     // Always need aligned and unaligned versions
2381     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2382                                                                                     "jshort_disjoint_arraycopy");
2383     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2384                                                                                     &entry_jshort_arraycopy,
2385                                                                                     "jshort_arraycopy");
2386     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2387                                                                                     "arrayof_jshort_disjoint_arraycopy");
2388     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2389                                                                                     "arrayof_jshort_arraycopy");
2390 
2391     //*** jint
2392     // Aligned versions
2393     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2394                                                                                 "arrayof_jint_disjoint_arraycopy");
2395     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2396                                                                                 "arrayof_jint_arraycopy");
2397     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2398     // entry_jint_arraycopy always points to the unaligned version
2399     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2400                                                                                 "jint_disjoint_arraycopy");
2401     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2402                                                                                 &entry_jint_arraycopy,
2403                                                                                 "jint_arraycopy");
2404 
2405     //*** jlong
2406     // It is always aligned
2407     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2408                                                                                   "arrayof_jlong_disjoint_arraycopy");
2409     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2410                                                                                   "arrayof_jlong_arraycopy");
2411     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2412     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2413 
2414     //*** oops
2415     {
2416       // With compressed oops we need unaligned versions; notice that
2417       // we overwrite entry_oop_arraycopy.
2418       bool aligned = !UseCompressedOops;
2419 
2420       StubRoutines::_arrayof_oop_disjoint_arraycopy
2421         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2422                                      /*dest_uninitialized*/false);
2423       StubRoutines::_arrayof_oop_arraycopy
2424         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2425                                      /*dest_uninitialized*/false);
2426       // Aligned versions without pre-barriers
2427       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2428         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2429                                      /*dest_uninitialized*/true);
2430       StubRoutines::_arrayof_oop_arraycopy_uninit
2431         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2432                                      /*dest_uninitialized*/true);
2433     }
2434 
2435     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2436     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2437     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2438     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2439 
2440     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2441     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2442                                                                         /*dest_uninitialized*/true);
2443 
2444     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2445                                                               entry_jbyte_arraycopy,
2446                                                               entry_jshort_arraycopy,
2447                                                               entry_jint_arraycopy,
2448                                                               entry_jlong_arraycopy);
2449 
2450     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2451                                                                entry_jbyte_arraycopy,
2452                                                                entry_jshort_arraycopy,
2453                                                                entry_jint_arraycopy,
2454                                                                entry_oop_arraycopy,
2455                                                                entry_jlong_arraycopy,
2456                                                                entry_checkcast_arraycopy);
2457 
2458     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2459     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2460     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2461     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2462     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2463     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2464   }
2465 
2466   void generate_math_stubs() { Unimplemented(); }
2467 
2468   // Arguments:
2469   //
2470   // Inputs:
2471   //   c_rarg0   - source byte array address
2472   //   c_rarg1   - destination byte array address
2473   //   c_rarg2   - K (key) in little endian int array
2474   //
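  // Note (illustrative): the key array length in ints selects the number of
  // rounds -- 44 ints for AES-128 (10 rounds), 52 for AES-192 (12 rounds) and
  // 60 for AES-256 (14 rounds) -- which is why the code below compares keylen
  // against 44 and 52 to decide how many extra round-key pairs to apply.
  //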
2475   address generate_aescrypt_encryptBlock() {
2476     __ align(CodeEntryAlignment);
2477     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2478 
2479     Label L_doLast;
2480 
2481     const Register from        = c_rarg0;  // source array address
2482     const Register to          = c_rarg1;  // destination array address
2483     const Register key         = c_rarg2;  // key array address
2484     const Register keylen      = rscratch1;
2485 
2486     address start = __ pc();
2487     __ enter();
2488 
2489     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2490 
2491     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2492 
2493     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2494     __ rev32(v1, __ T16B, v1);
2495     __ rev32(v2, __ T16B, v2);
2496     __ rev32(v3, __ T16B, v3);
2497     __ rev32(v4, __ T16B, v4);
2498     __ aese(v0, v1);
2499     __ aesmc(v0, v0);
2500     __ aese(v0, v2);
2501     __ aesmc(v0, v0);
2502     __ aese(v0, v3);
2503     __ aesmc(v0, v0);
2504     __ aese(v0, v4);
2505     __ aesmc(v0, v0);
2506 
2507     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2508     __ rev32(v1, __ T16B, v1);
2509     __ rev32(v2, __ T16B, v2);
2510     __ rev32(v3, __ T16B, v3);
2511     __ rev32(v4, __ T16B, v4);
2512     __ aese(v0, v1);
2513     __ aesmc(v0, v0);
2514     __ aese(v0, v2);
2515     __ aesmc(v0, v0);
2516     __ aese(v0, v3);
2517     __ aesmc(v0, v0);
2518     __ aese(v0, v4);
2519     __ aesmc(v0, v0);
2520 
2521     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2522     __ rev32(v1, __ T16B, v1);
2523     __ rev32(v2, __ T16B, v2);
2524 
2525     __ cmpw(keylen, 44);
2526     __ br(Assembler::EQ, L_doLast);
2527 
2528     __ aese(v0, v1);
2529     __ aesmc(v0, v0);
2530     __ aese(v0, v2);
2531     __ aesmc(v0, v0);
2532 
2533     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2534     __ rev32(v1, __ T16B, v1);
2535     __ rev32(v2, __ T16B, v2);
2536 
2537     __ cmpw(keylen, 52);
2538     __ br(Assembler::EQ, L_doLast);
2539 
2540     __ aese(v0, v1);
2541     __ aesmc(v0, v0);
2542     __ aese(v0, v2);
2543     __ aesmc(v0, v0);
2544 
2545     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2546     __ rev32(v1, __ T16B, v1);
2547     __ rev32(v2, __ T16B, v2);
2548 
2549     __ BIND(L_doLast);
2550 
2551     __ aese(v0, v1);
2552     __ aesmc(v0, v0);
2553     __ aese(v0, v2);
2554 
2555     __ ld1(v1, __ T16B, key);
2556     __ rev32(v1, __ T16B, v1);
2557     __ eor(v0, __ T16B, v0, v1);
2558 
2559     __ st1(v0, __ T16B, to);
2560 
2561     __ mov(r0, 0);
2562 
2563     __ leave();
2564     __ ret(lr);
2565 
2566     return start;
2567   }
2568 
2569   // Arguments:
2570   //
2571   // Inputs:
2572   //   c_rarg0   - source byte array address
2573   //   c_rarg1   - destination byte array address
2574   //   c_rarg2   - K (key) in little endian int array
2575   //
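  // Note (illustrative): decryption walks the same expanded key (44/52/60 ints
  // for AES-128/192/256) but applies aesd/aesimc rounds; the very first round
  // key is loaded into v5 up front and folded in with the final eor below.
  //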
2576   address generate_aescrypt_decryptBlock() {
2577     assert(UseAES, "need AES instructions support");
2578     __ align(CodeEntryAlignment);
2579     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2580     Label L_doLast;
2581 
2582     const Register from        = c_rarg0;  // source array address
2583     const Register to          = c_rarg1;  // destination array address
2584     const Register key         = c_rarg2;  // key array address
2585     const Register keylen      = rscratch1;
2586 
2587     address start = __ pc();
2588     __ enter(); // required for proper stackwalking of RuntimeStub frame
2589 
2590     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2591 
2592     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2593 
2594     __ ld1(v5, __ T16B, __ post(key, 16));
2595     __ rev32(v5, __ T16B, v5);
2596 
2597     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2598     __ rev32(v1, __ T16B, v1);
2599     __ rev32(v2, __ T16B, v2);
2600     __ rev32(v3, __ T16B, v3);
2601     __ rev32(v4, __ T16B, v4);
2602     __ aesd(v0, v1);
2603     __ aesimc(v0, v0);
2604     __ aesd(v0, v2);
2605     __ aesimc(v0, v0);
2606     __ aesd(v0, v3);
2607     __ aesimc(v0, v0);
2608     __ aesd(v0, v4);
2609     __ aesimc(v0, v0);
2610 
2611     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2612     __ rev32(v1, __ T16B, v1);
2613     __ rev32(v2, __ T16B, v2);
2614     __ rev32(v3, __ T16B, v3);
2615     __ rev32(v4, __ T16B, v4);
2616     __ aesd(v0, v1);
2617     __ aesimc(v0, v0);
2618     __ aesd(v0, v2);
2619     __ aesimc(v0, v0);
2620     __ aesd(v0, v3);
2621     __ aesimc(v0, v0);
2622     __ aesd(v0, v4);
2623     __ aesimc(v0, v0);
2624 
2625     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2626     __ rev32(v1, __ T16B, v1);
2627     __ rev32(v2, __ T16B, v2);
2628 
2629     __ cmpw(keylen, 44);
2630     __ br(Assembler::EQ, L_doLast);
2631 
2632     __ aesd(v0, v1);
2633     __ aesimc(v0, v0);
2634     __ aesd(v0, v2);
2635     __ aesimc(v0, v0);
2636 
2637     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2638     __ rev32(v1, __ T16B, v1);
2639     __ rev32(v2, __ T16B, v2);
2640 
2641     __ cmpw(keylen, 52);
2642     __ br(Assembler::EQ, L_doLast);
2643 
2644     __ aesd(v0, v1);
2645     __ aesimc(v0, v0);
2646     __ aesd(v0, v2);
2647     __ aesimc(v0, v0);
2648 
2649     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2650     __ rev32(v1, __ T16B, v1);
2651     __ rev32(v2, __ T16B, v2);
2652 
2653     __ BIND(L_doLast);
2654 
2655     __ aesd(v0, v1);
2656     __ aesimc(v0, v0);
2657     __ aesd(v0, v2);
2658 
2659     __ eor(v0, __ T16B, v0, v5);
2660 
2661     __ st1(v0, __ T16B, to);
2662 
2663     __ mov(r0, 0);
2664 
2665     __ leave();
2666     __ ret(lr);
2667 
2668     return start;
2669   }
2670 
2671   // Arguments:
2672   //
2673   // Inputs:
2674   //   c_rarg0   - source byte array address
2675   //   c_rarg1   - destination byte array address
2676   //   c_rarg2   - K (key) in little endian int array
2677   //   c_rarg3   - r vector byte array address
2678   //   c_rarg4   - input length
2679   //
2680   // Output:
2681   //   x0        - input length
2682   //
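  // CBC chaining (illustrative): C[i] = AES_Encrypt(P[i] ^ C[i-1]) with
  // C[-1] = IV taken from rvec; on exit rvec holds the last ciphertext block
  // and r0 returns the number of input bytes processed.
  //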
2683   address generate_cipherBlockChaining_encryptAESCrypt() {
2684     assert(UseAES, "need AES instructions support");
2685     __ align(CodeEntryAlignment);
2686     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2687 
2688     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2689 
2690     const Register from        = c_rarg0;  // source array address
2691     const Register to          = c_rarg1;  // destination array address
2692     const Register key         = c_rarg2;  // key array address
2693     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2694                                            // and left holding the result of the last encryption block
2695     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2696     const Register keylen      = rscratch1;
2697 
2698     address start = __ pc();
2699 
2700       __ enter();
2701 
2702       __ movw(rscratch2, len_reg);
2703 
2704       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2705 
2706       __ ld1(v0, __ T16B, rvec);
2707 
2708       __ cmpw(keylen, 52);
2709       __ br(Assembler::CC, L_loadkeys_44);
2710       __ br(Assembler::EQ, L_loadkeys_52);
2711 
2712       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2713       __ rev32(v17, __ T16B, v17);
2714       __ rev32(v18, __ T16B, v18);
2715     __ BIND(L_loadkeys_52);
2716       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2717       __ rev32(v19, __ T16B, v19);
2718       __ rev32(v20, __ T16B, v20);
2719     __ BIND(L_loadkeys_44);
2720       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2721       __ rev32(v21, __ T16B, v21);
2722       __ rev32(v22, __ T16B, v22);
2723       __ rev32(v23, __ T16B, v23);
2724       __ rev32(v24, __ T16B, v24);
2725       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2726       __ rev32(v25, __ T16B, v25);
2727       __ rev32(v26, __ T16B, v26);
2728       __ rev32(v27, __ T16B, v27);
2729       __ rev32(v28, __ T16B, v28);
2730       __ ld1(v29, v30, v31, __ T16B, key);
2731       __ rev32(v29, __ T16B, v29);
2732       __ rev32(v30, __ T16B, v30);
2733       __ rev32(v31, __ T16B, v31);
2734 
2735     __ BIND(L_aes_loop);
2736       __ ld1(v1, __ T16B, __ post(from, 16));
2737       __ eor(v0, __ T16B, v0, v1);
2738 
2739       __ br(Assembler::CC, L_rounds_44);
2740       __ br(Assembler::EQ, L_rounds_52);
2741 
2742       __ aese(v0, v17); __ aesmc(v0, v0);
2743       __ aese(v0, v18); __ aesmc(v0, v0);
2744     __ BIND(L_rounds_52);
2745       __ aese(v0, v19); __ aesmc(v0, v0);
2746       __ aese(v0, v20); __ aesmc(v0, v0);
2747     __ BIND(L_rounds_44);
2748       __ aese(v0, v21); __ aesmc(v0, v0);
2749       __ aese(v0, v22); __ aesmc(v0, v0);
2750       __ aese(v0, v23); __ aesmc(v0, v0);
2751       __ aese(v0, v24); __ aesmc(v0, v0);
2752       __ aese(v0, v25); __ aesmc(v0, v0);
2753       __ aese(v0, v26); __ aesmc(v0, v0);
2754       __ aese(v0, v27); __ aesmc(v0, v0);
2755       __ aese(v0, v28); __ aesmc(v0, v0);
2756       __ aese(v0, v29); __ aesmc(v0, v0);
2757       __ aese(v0, v30);
2758       __ eor(v0, __ T16B, v0, v31);
2759 
2760       __ st1(v0, __ T16B, __ post(to, 16));
2761 
2762       __ subw(len_reg, len_reg, 16);
2763       __ cbnzw(len_reg, L_aes_loop);
2764 
2765       __ st1(v0, __ T16B, rvec);
2766 
2767       __ mov(r0, rscratch2);
2768 
2769       __ leave();
2770       __ ret(lr);
2771 
2772       return start;
2773   }
2774 
2775   // Arguments:
2776   //
2777   // Inputs:
2778   //   c_rarg0   - source byte array address
2779   //   c_rarg1   - destination byte array address
2780   //   c_rarg2   - K (key) in little endian int array
2781   //   c_rarg3   - r vector byte array address
2782   //   c_rarg4   - input length
2783   //
2784   // Output:
2785   //   r0        - input length
2786   //
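  // A hedged sketch of what this stub computes (standard CBC decryption;
  // the r vector carries the previous cipher block C[i-1], with C[-1] = IV):
  //
  //   for (int i = 0; i < len / 16; i++)
  //     P[i] = AES_decrypt(C[i], K) ^ C[i-1];   // v2 holds C[i-1] across iterations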
2787   address generate_cipherBlockChaining_decryptAESCrypt() {
2788     assert(UseAES, "need AES instructions for this intrinsic");
2789     __ align(CodeEntryAlignment);
2790     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2791 
2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2793 
2794     const Register from        = c_rarg0;  // source array address
2795     const Register to          = c_rarg1;  // destination array address
2796     const Register key         = c_rarg2;  // key array address
2797     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector address
2798                                            // and left holding the last cipher block processed
2799     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2800     const Register keylen      = rscratch1;
2801 
2802     address start = __ pc();
2803 
2804       __ enter();
2805 
2806       __ movw(rscratch2, len_reg);
2807 
2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2809 
2810       __ ld1(v2, __ T16B, rvec);
2811 
2812       __ ld1(v31, __ T16B, __ post(key, 16));
2813       __ rev32(v31, __ T16B, v31);
2814 
2815       __ cmpw(keylen, 52);
2816       __ br(Assembler::CC, L_loadkeys_44);
2817       __ br(Assembler::EQ, L_loadkeys_52);
2818 
2819       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2820       __ rev32(v17, __ T16B, v17);
2821       __ rev32(v18, __ T16B, v18);
2822     __ BIND(L_loadkeys_52);
2823       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2824       __ rev32(v19, __ T16B, v19);
2825       __ rev32(v20, __ T16B, v20);
2826     __ BIND(L_loadkeys_44);
2827       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2828       __ rev32(v21, __ T16B, v21);
2829       __ rev32(v22, __ T16B, v22);
2830       __ rev32(v23, __ T16B, v23);
2831       __ rev32(v24, __ T16B, v24);
2832       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2833       __ rev32(v25, __ T16B, v25);
2834       __ rev32(v26, __ T16B, v26);
2835       __ rev32(v27, __ T16B, v27);
2836       __ rev32(v28, __ T16B, v28);
2837       __ ld1(v29, v30, __ T16B, key);
2838       __ rev32(v29, __ T16B, v29);
2839       __ rev32(v30, __ T16B, v30);
2840 
2841     __ BIND(L_aes_loop);
2842       __ ld1(v0, __ T16B, __ post(from, 16));
2843       __ orr(v1, __ T16B, v0, v0);
2844 
2845       __ br(Assembler::CC, L_rounds_44);
2846       __ br(Assembler::EQ, L_rounds_52);
2847 
2848       __ aesd(v0, v17); __ aesimc(v0, v0);
2849       __ aesd(v0, v18); __ aesimc(v0, v0);
2850     __ BIND(L_rounds_52);
2851       __ aesd(v0, v19); __ aesimc(v0, v0);
2852       __ aesd(v0, v20); __ aesimc(v0, v0);
2853     __ BIND(L_rounds_44);
2854       __ aesd(v0, v21); __ aesimc(v0, v0);
2855       __ aesd(v0, v22); __ aesimc(v0, v0);
2856       __ aesd(v0, v23); __ aesimc(v0, v0);
2857       __ aesd(v0, v24); __ aesimc(v0, v0);
2858       __ aesd(v0, v25); __ aesimc(v0, v0);
2859       __ aesd(v0, v26); __ aesimc(v0, v0);
2860       __ aesd(v0, v27); __ aesimc(v0, v0);
2861       __ aesd(v0, v28); __ aesimc(v0, v0);
2862       __ aesd(v0, v29); __ aesimc(v0, v0);
2863       __ aesd(v0, v30);
2864       __ eor(v0, __ T16B, v0, v31);
2865       __ eor(v0, __ T16B, v0, v2);
2866 
2867       __ st1(v0, __ T16B, __ post(to, 16));
2868       __ orr(v2, __ T16B, v1, v1);
2869 
2870       __ subw(len_reg, len_reg, 16);
2871       __ cbnzw(len_reg, L_aes_loop);
2872 
2873       __ st1(v2, __ T16B, rvec);
2874 
2875       __ mov(r0, rscratch2);
2876 
2877       __ leave();
2878       __ ret(lr);
2879 
2880     return start;
2881   }
2882 
2883   // Arguments:
2884   //
2885   // Inputs:
2886   //   c_rarg0   - byte[]  source+offset
2887   //   c_rarg1   - int[]   SHA.state
2888   //   c_rarg2   - int     offset
2889   //   c_rarg3   - int     limit
2890   //
2891   address generate_sha1_implCompress(bool multi_block, const char *name) {
2892     __ align(CodeEntryAlignment);
2893     StubCodeMark mark(this, "StubRoutines", name);
2894     address start = __ pc();
2895 
2896     Register buf   = c_rarg0;
2897     Register state = c_rarg1;
2898     Register ofs   = c_rarg2;
2899     Register limit = c_rarg3;
2900 
2901     Label keys;
2902     Label sha1_loop;
2903 
2904     // load the keys into v0..v3
2905     __ adr(rscratch1, keys);
2906     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2907     // load 5 words state into v6, v7
2908     __ ldrq(v6, Address(state, 0));
2909     __ ldrs(v7, Address(state, 16));
2910 
2911 
2912     __ BIND(sha1_loop);
2913     // load 64 bytes of data into v16..v19
2914     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2915     __ rev32(v16, __ T16B, v16);
2916     __ rev32(v17, __ T16B, v17);
2917     __ rev32(v18, __ T16B, v18);
2918     __ rev32(v19, __ T16B, v19);
2919 
2920     // do the sha1
2921     __ addv(v4, __ T4S, v16, v0);
2922     __ orr(v20, __ T16B, v6, v6);
2923 
2924     FloatRegister d0 = v16;
2925     FloatRegister d1 = v17;
2926     FloatRegister d2 = v18;
2927     FloatRegister d3 = v19;
2928 
2929     for (int round = 0; round < 20; round++) {
2930       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2931       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2932       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2933       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2934       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2935 
2936       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2937       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2938       __ sha1h(tmp2, __ T4S, v20);
2939       if (round < 5)
2940         __ sha1c(v20, __ T4S, tmp3, tmp4);
2941       else if (round < 10 || round >= 15)
2942         __ sha1p(v20, __ T4S, tmp3, tmp4);
2943       else
2944         __ sha1m(v20, __ T4S, tmp3, tmp4);
2945       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2946 
2947       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2948     }
2949 
2950     __ addv(v7, __ T2S, v7, v21);
2951     __ addv(v6, __ T4S, v6, v20);
2952 
2953     if (multi_block) {
2954       __ add(ofs, ofs, 64);
2955       __ cmp(ofs, limit);
2956       __ br(Assembler::LE, sha1_loop);
2957       __ mov(c_rarg0, ofs); // return ofs
2958     }
2959 
2960     __ strq(v6, Address(state, 0));
2961     __ strs(v7, Address(state, 16));
2962 
2963     __ ret(lr);
2964 
2965     __ bind(keys);
2966     __ emit_int32(0x5a827999);  // SHA-1 K, rounds  0..19
2967     __ emit_int32(0x6ed9eba1);  // SHA-1 K, rounds 20..39
2968     __ emit_int32(0x8f1bbcdc);  // SHA-1 K, rounds 40..59
2969     __ emit_int32(0xca62c1d6);  // SHA-1 K, rounds 60..79
2970 
2971     return start;
2972   }
2973 
2974 
2975   // Arguments:
2976   //
2977   // Inputs:
2978   //   c_rarg0   - byte[]  source+offset
2979   //   c_rarg1   - int[]   SHA.state
2980   //   c_rarg2   - int     offset
2981   //   c_rarg3   - int     limit
2982   //
2983   address generate_sha256_implCompress(bool multi_block, const char *name) {
2984     static const uint32_t round_consts[64] = {  // SHA-256 round constants K[0..63] (FIPS 180-4)
2985       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2986       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2987       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2988       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2989       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2990       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2991       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2992       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2993       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2994       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2995       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2996       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2997       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2998       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2999       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3000       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3001     };
3002     __ align(CodeEntryAlignment);
3003     StubCodeMark mark(this, "StubRoutines", name);
3004     address start = __ pc();
3005 
3006     Register buf   = c_rarg0;
3007     Register state = c_rarg1;
3008     Register ofs   = c_rarg2;
3009     Register limit = c_rarg3;
3010 
3011     Label sha1_loop;
3012 
3013     __ stpd(v8, v9, __ pre(sp, -32));
3014     __ stpd(v10, v11, Address(sp, 16));
3015 
3016 // dga == v0
3017 // dgb == v1
3018 // dg0 == v2
3019 // dg1 == v3
3020 // dg2 == v4
3021 // t0 == v6
3022 // t1 == v7
3023 
3024     // load 16 keys to v16..v31
3025     __ lea(rscratch1, ExternalAddress((address)round_consts));
3026     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3027     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3028     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3029     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3030 
3031     // load 8 words (256 bits) state
3032     __ ldpq(v0, v1, state);
3033 
3034     __ BIND(sha1_loop);
3035     // load 64 bytes of data into v8..v11
3036     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3037     __ rev32(v8, __ T16B, v8);
3038     __ rev32(v9, __ T16B, v9);
3039     __ rev32(v10, __ T16B, v10);
3040     __ rev32(v11, __ T16B, v11);
3041 
3042     __ addv(v6, __ T4S, v8, v16);
3043     __ orr(v2, __ T16B, v0, v0);
3044     __ orr(v3, __ T16B, v1, v1);
3045 
3046     FloatRegister d0 = v8;
3047     FloatRegister d1 = v9;
3048     FloatRegister d2 = v10;
3049     FloatRegister d3 = v11;
3050 
3051 
3052     for (int round = 0; round < 16; round++) {
3053       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3054       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3055       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3056       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3057 
3058       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3059        __ orr(v4, __ T16B, v2, v2);
3060       if (round < 15)
3061         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3062       __ sha256h(v2, __ T4S, v3, tmp2);
3063       __ sha256h2(v3, __ T4S, v4, tmp2);
3064       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3065 
3066       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3067     }
3068 
3069     __ addv(v0, __ T4S, v0, v2);
3070     __ addv(v1, __ T4S, v1, v3);
3071 
3072     if (multi_block) {
3073       __ add(ofs, ofs, 64);
3074       __ cmp(ofs, limit);
3075       __ br(Assembler::LE, sha1_loop);
3076       __ mov(c_rarg0, ofs); // return ofs
3077     }
3078 
3079     __ ldpd(v10, v11, Address(sp, 16));
3080     __ ldpd(v8, v9, __ post(sp, 32));
3081 
3082     __ stpq(v0, v1, state);
3083 
3084     __ ret(lr);
3085 
3086     return start;
3087   }
3088 
3089   // Safefetch stubs.
3090   void generate_safefetch(const char* name, int size, address* entry,
3091                           address* fault_pc, address* continuation_pc) {
3092     // safefetch signatures:
3093     //   int      SafeFetch32(int*      adr, int      errValue);
3094     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3095     //
3096     // arguments:
3097     //   c_rarg0 = adr
3098     //   c_rarg1 = errValue
3099     //
3100     // result:
3101     //   r0       = *adr or errValue
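    // Illustrative use (a hedged sketch, not code from this file): callers go
    // through the SafeFetch32/SafeFetchN wrappers declared above, e.g.
    //
    //   int v = SafeFetch32((int*) maybe_unmapped_addr, /*errValue*/ -1);
    //
    // If the load at *adr faults, the signal handler resumes execution at
    // *continuation_pc, so the call simply returns errValue.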
3102 
3103     StubCodeMark mark(this, "StubRoutines", name);
3104 
3105     // Entry point, pc or function descriptor.
3106     *entry = __ pc();
3107 
3108     // Load *adr into c_rarg1, may fault.
3109     *fault_pc = __ pc();
3110     switch (size) {
3111       case 4:
3112         // int32_t
3113         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3114         break;
3115       case 8:
3116         // int64_t
3117         __ ldr(c_rarg1, Address(c_rarg0, 0));
3118         break;
3119       default:
3120         ShouldNotReachHere();
3121     }
3122 
3123     // return errValue or *adr
3124     *continuation_pc = __ pc();
3125     __ mov(r0, c_rarg1);
3126     __ ret(lr);
3127   }
3128 
3129   /**
3130    *  Arguments:
3131    *
3132    * Inputs:
3133    *   c_rarg0   - int crc
3134    *   c_rarg1   - byte* buf
3135    *   c_rarg2   - int length
3136    *
3137    * Output:
3138    *       r0    - int crc result
3139    */
3140   address generate_updateBytesCRC32() {
3141     assert(UseCRC32Intrinsics, "what are we doing here?");
3142 
3143     __ align(CodeEntryAlignment);
3144     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3145 
3146     address start = __ pc();
3147 
3148     const Register crc   = c_rarg0;  // crc
3149     const Register buf   = c_rarg1;  // source java byte array address
3150     const Register len   = c_rarg2;  // length
3151     const Register table0 = c_rarg3; // crc_table address
3152     const Register table1 = c_rarg4;
3153     const Register table2 = c_rarg5;
3154     const Register table3 = c_rarg6;
3155     const Register tmp3 = c_rarg7;
3156 
3157     BLOCK_COMMENT("Entry:");
3158     __ enter(); // required for proper stackwalking of RuntimeStub frame
3159 
3160     __ kernel_crc32(crc, buf, len,
3161               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3162 
3163     __ leave(); // required for proper stackwalking of RuntimeStub frame
3164     __ ret(lr);
3165 
3166     return start;
3167   }
3168 
3169   /**
3170    *  Arguments:
3171    *
3172    * Inputs:
3173    *   c_rarg0   - int crc
3174    *   c_rarg1   - byte* buf
3175    *   c_rarg2   - int length
3176    *   c_rarg3   - int* table
3177    *
3178    * Output:
3179    *       r0   - int crc result
3180    */
3181   address generate_updateBytesCRC32C() {
3182     assert(UseCRC32CIntrinsics, "what are we doing here?");
3183 
3184     __ align(CodeEntryAlignment);
3185     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3186 
3187     address start = __ pc();
3188 
3189     const Register crc   = c_rarg0;  // crc
3190     const Register buf   = c_rarg1;  // source java byte array address
3191     const Register len   = c_rarg2;  // length
3192     const Register table0 = c_rarg3; // crc_table address
3193     const Register table1 = c_rarg4;
3194     const Register table2 = c_rarg5;
3195     const Register table3 = c_rarg6;
3196     const Register tmp3 = c_rarg7;
3197 
3198     BLOCK_COMMENT("Entry:");
3199     __ enter(); // required for proper stackwalking of RuntimeStub frame
3200 
3201     __ kernel_crc32c(crc, buf, len,
3202               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3203 
3204     __ leave(); // required for proper stackwalking of RuntimeStub frame
3205     __ ret(lr);
3206 
3207     return start;
3208   }
3209 
3210   /***
3211    *  Arguments:
3212    *
3213    *  Inputs:
3214    *   c_rarg0   - int   adler
3215    *   c_rarg1   - byte* buff
3216    *   c_rarg2   - int   len
3217    *
3218    * Output:
3219    *   c_rarg0   - int adler result
3220    */
3221   address generate_updateBytesAdler32() {
3222     __ align(CodeEntryAlignment);
3223     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3224     address start = __ pc();
3225 
3226     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3227 
3228     // Aliases
3229     Register adler  = c_rarg0;
3230     Register s1     = c_rarg0;
3231     Register s2     = c_rarg3;
3232     Register buff   = c_rarg1;
3233     Register len    = c_rarg2;
3234     Register nmax  = r4;
3235     Register base  = r5;
3236     Register count = r6;
3237     Register temp0 = rscratch1;
3238     Register temp1 = rscratch2;
3239     FloatRegister vbytes = v0;
3240     FloatRegister vs1acc = v1;
3241     FloatRegister vs2acc = v2;
3242     FloatRegister vtable = v3;
3243 
3244     // Max number of bytes we can process before having to take the mod
3245     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3246     u_int64_t BASE = 0xfff1;
3247     u_int64_t NMAX = 0x15B0;
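    // Sanity check for NMAX = 5552 (BASE - 1 = 65520):
    //   255*5552*5553/2 + 5553*65520 = 3,930,857,640 + 363,832,560
    //                                = 4,294,690,200 <= 2^32 - 1,
    // while n = 5553 would already exceed 2^32 - 1.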
3248 
3249     __ mov(base, BASE);
3250     __ mov(nmax, NMAX);
3251 
3252     // Load accumulation coefficients for the upper 16 bits
3253     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3254     __ ld1(vtable, __ T16B, Address(temp0));
3255 
3256     // s1 is initialized to the lower 16 bits of adler
3257     // s2 is initialized to the upper 16 bits of adler
3258     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3259     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3260 
3261     // The pipelined loop needs at least 16 elements for one iteration.
3262     // It checks this itself, but it is cheaper to branch straight to the cleanup loop here.
3263     __ cmp(len, (u1)16);
3264     __ br(Assembler::HS, L_nmax);
3265     __ cbz(len, L_combine);
3266 
3267     __ bind(L_simple_by1_loop);
3268     __ ldrb(temp0, Address(__ post(buff, 1)));
3269     __ add(s1, s1, temp0);
3270     __ add(s2, s2, s1);
3271     __ subs(len, len, 1);
3272     __ br(Assembler::HI, L_simple_by1_loop);
3273 
3274     // s1 = s1 % BASE
3275     __ subs(temp0, s1, base);
3276     __ csel(s1, temp0, s1, Assembler::HS);
3277 
3278     // s2 = s2 % BASE
3279     __ lsr(temp0, s2, 16);
3280     __ lsl(temp1, temp0, 4);
3281     __ sub(temp1, temp1, temp0);
3282     __ add(s2, temp1, s2, ext::uxth);
3283 
3284     __ subs(temp0, s2, base);
3285     __ csel(s2, temp0, s2, Assembler::HS);
3286 
3287     __ b(L_combine);
3288 
3289     __ bind(L_nmax);
3290     __ subs(len, len, nmax);
3291     __ sub(count, nmax, 16);
3292     __ br(Assembler::LO, L_by16);
3293 
3294     __ bind(L_nmax_loop);
3295 
3296     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3297                                       vbytes, vs1acc, vs2acc, vtable);
3298 
3299     __ subs(count, count, 16);
3300     __ br(Assembler::HS, L_nmax_loop);
3301 
3302     // s1 = s1 % BASE
3303     __ lsr(temp0, s1, 16);
3304     __ lsl(temp1, temp0, 4);
3305     __ sub(temp1, temp1, temp0);
3306     __ add(temp1, temp1, s1, ext::uxth);
3307 
3308     __ lsr(temp0, temp1, 16);
3309     __ lsl(s1, temp0, 4);
3310     __ sub(s1, s1, temp0);
3311     __ add(s1, s1, temp1, ext:: uxth);
3312 
3313     __ subs(temp0, s1, base);
3314     __ csel(s1, temp0, s1, Assembler::HS);
3315 
3316     // s2 = s2 % BASE
3317     __ lsr(temp0, s2, 16);
3318     __ lsl(temp1, temp0, 4);
3319     __ sub(temp1, temp1, temp0);
3320     __ add(temp1, temp1, s2, ext::uxth);
3321 
3322     __ lsr(temp0, temp1, 16);
3323     __ lsl(s2, temp0, 4);
3324     __ sub(s2, s2, temp0);
3325     __ add(s2, s2, temp1, ext:: uxth);
3326 
3327     __ subs(temp0, s2, base);
3328     __ csel(s2, temp0, s2, Assembler::HS);
3329 
3330     __ subs(len, len, nmax);
3331     __ sub(count, nmax, 16);
3332     __ br(Assembler::HS, L_nmax_loop);
3333 
3334     __ bind(L_by16);
3335     __ adds(len, len, count);
3336     __ br(Assembler::LO, L_by1);
3337 
3338     __ bind(L_by16_loop);
3339 
3340     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3341                                       vbytes, vs1acc, vs2acc, vtable);
3342 
3343     __ subs(len, len, 16);
3344     __ br(Assembler::HS, L_by16_loop);
3345 
3346     __ bind(L_by1);
3347     __ adds(len, len, 15);
3348     __ br(Assembler::LO, L_do_mod);
3349 
3350     __ bind(L_by1_loop);
3351     __ ldrb(temp0, Address(__ post(buff, 1)));
3352     __ add(s1, temp0, s1);
3353     __ add(s2, s2, s1);
3354     __ subs(len, len, 1);
3355     __ br(Assembler::HS, L_by1_loop);
3356 
3357     __ bind(L_do_mod);
3358     // s1 = s1 % BASE
3359     __ lsr(temp0, s1, 16);
3360     __ lsl(temp1, temp0, 4);
3361     __ sub(temp1, temp1, temp0);
3362     __ add(temp1, temp1, s1, ext::uxth);
3363 
3364     __ lsr(temp0, temp1, 16);
3365     __ lsl(s1, temp0, 4);
3366     __ sub(s1, s1, temp0);
3367     __ add(s1, s1, temp1, ext:: uxth);
3368 
3369     __ subs(temp0, s1, base);
3370     __ csel(s1, temp0, s1, Assembler::HS);
3371 
3372     // s2 = s2 % BASE
3373     __ lsr(temp0, s2, 16);
3374     __ lsl(temp1, temp0, 4);
3375     __ sub(temp1, temp1, temp0);
3376     __ add(temp1, temp1, s2, ext::uxth);
3377 
3378     __ lsr(temp0, temp1, 16);
3379     __ lsl(s2, temp0, 4);
3380     __ sub(s2, s2, temp0);
3381     __ add(s2, s2, temp1, ext:: uxth);
3382 
3383     __ subs(temp0, s2, base);
3384     __ csel(s2, temp0, s2, Assembler::HS);
3385 
3386     // Combine lower bits and higher bits
3387     __ bind(L_combine);
3388     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3389 
3390     __ ret(lr);
3391 
3392     return start;
3393   }
3394 
3395   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3396           Register temp0, Register temp1, FloatRegister vbytes,
3397           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3398     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3399     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3400     // In non-vectorized code, we update s1 and s2 as:
3401     //   s1 <- s1 + b1
3402     //   s2 <- s2 + s1
3403     //   s1 <- s1 + b2
3404     //   s2 <- s2 + s1
3405     //   ...
3406     //   s1 <- s1 + b16
3407     //   s2 <- s2 + s1
3408     // Putting above assignments together, we have:
3409     //   s1_new = s1 + b1 + b2 + ... + b16
3410     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3411     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3412     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
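    // A scalar reference for one 16-byte step (a hypothetical helper, shown
    // only to illustrate the identities above; not part of this file):
    //
    //   static inline void adler_step16(uint32_t &s1, uint32_t &s2, const uint8_t *b) {
    //     uint32_t sum = 0, dot = 0;
    //     for (int i = 0; i < 16; i++) {
    //       sum += b[i];              // b1 + b2 + ... + b16
    //       dot += (16 - i) * b[i];   // b1*16 + b2*15 + ... + b16*1
    //     }
    //     s2 += 16 * s1 + dot;        // s2_new
    //     s1 += sum;                  // s1_new
    //   }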
3413     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3414 
3415     // s2 = s2 + s1 * 16
3416     __ add(s2, s2, s1, Assembler::LSL, 4);
3417 
3418     // vs1acc = b1 + b2 + b3 + ... + b16
3419     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3420     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3421     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3422     __ uaddlv(vs1acc, __ T16B, vbytes);
3423     __ uaddlv(vs2acc, __ T8H, vs2acc);
3424 
3425     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3426     __ fmovd(temp0, vs1acc);
3427     __ fmovd(temp1, vs2acc);
3428     __ add(s1, s1, temp0);
3429     __ add(s2, s2, temp1);
3430   }
3431 
3432   /**
3433    *  Arguments:
3434    *
3435    *  Input:
3436    *    c_rarg0   - x address
3437    *    c_rarg1   - x length
3438    *    c_rarg2   - y address
3439    *   c_rarg3   - y length
3440    *    c_rarg4   - z address
3441    *    c_rarg5   - z length
3442    */
3443   address generate_multiplyToLen() {
3444     __ align(CodeEntryAlignment);
3445     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3446 
3447     address start = __ pc();
3448     const Register x     = r0;
3449     const Register xlen  = r1;
3450     const Register y     = r2;
3451     const Register ylen  = r3;
3452     const Register z     = r4;
3453     const Register zlen  = r5;
3454 
3455     const Register tmp1  = r10;
3456     const Register tmp2  = r11;
3457     const Register tmp3  = r12;
3458     const Register tmp4  = r13;
3459     const Register tmp5  = r14;
3460     const Register tmp6  = r15;
3461     const Register tmp7  = r16;
3462 
3463     BLOCK_COMMENT("Entry:");
3464     __ enter(); // required for proper stackwalking of RuntimeStub frame
3465     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3466     __ leave(); // required for proper stackwalking of RuntimeStub frame
3467     __ ret(lr);
3468 
3469     return start;
3470   }
3471 
3472   address generate_squareToLen() {
3473     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3474     // faster than multiply_to_len on some CPUs and slower on others, but
3475     // multiply_to_len gives slightly better results overall.
3476     __ align(CodeEntryAlignment);
3477     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3478     address start = __ pc();
3479 
3480     const Register x     = r0;
3481     const Register xlen  = r1;
3482     const Register z     = r2;
3483     const Register zlen  = r3;
3484     const Register y     = r4; // == x
3485     const Register ylen  = r5; // == xlen
3486 
3487     const Register tmp1  = r10;
3488     const Register tmp2  = r11;
3489     const Register tmp3  = r12;
3490     const Register tmp4  = r13;
3491     const Register tmp5  = r14;
3492     const Register tmp6  = r15;
3493     const Register tmp7  = r16;
3494 
3495     RegSet spilled_regs = RegSet::of(y, ylen);
3496     BLOCK_COMMENT("Entry:");
3497     __ enter();
3498     __ push(spilled_regs, sp);
3499     __ mov(y, x);
3500     __ mov(ylen, xlen);
3501     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3502     __ pop(spilled_regs, sp);
3503     __ leave();
3504     __ ret(lr);
3505     return start;
3506   }
3507 
3508   address generate_mulAdd() {
3509     __ align(CodeEntryAlignment);
3510     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3511 
3512     address start = __ pc();
3513 
3514     const Register out     = r0;
3515     const Register in      = r1;
3516     const Register offset  = r2;
3517     const Register len     = r3;
3518     const Register k       = r4;
3519 
3520     BLOCK_COMMENT("Entry:");
3521     __ enter();
3522     __ mul_add(out, in, offset, len, k);
3523     __ leave();
3524     __ ret(lr);
3525 
3526     return start;
3527   }
3528 
3529   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3530                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3531                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3532     // Karatsuba multiplication performs a 128*128 -> 256-bit
3533     // multiplication in three 128-bit multiplications and a few
3534     // additions.
3535     //
3536     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3537     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3538     //
3539     // Inputs:
3540     //
3541     // A0 in a.d[0]     (subkey)
3542     // A1 in a.d[1]
3543     // (A1+A0) in a1_xor_a0.d[0]
3544     //
3545     // B0 in b.d[0]     (state)
3546     // B1 in b.d[1]
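    //
    // For reference, the same three-multiplication structure written out
    // (a hedged sketch; clmul denotes a 64x64 -> 128-bit carry-less multiply,
    // and in GF(2) additions and subtractions are both XOR):
    //
    //   C = clmul(A1, B1);            // high product
    //   D = clmul(A0, B0);            // low product
    //   E = clmul(A1 ^ A0, B1 ^ B0);  // middle product
    //   A*B = C*z^128 ^ (C ^ D ^ E)*z^64 ^ D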
3547 
3548     __ ext(tmp1, __ T16B, b, b, 0x08);
3549     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3550     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3551     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3552     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3553 
3554     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3555     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3556     __ eor(tmp2, __ T16B, tmp2, tmp4);
3557     __ eor(tmp2, __ T16B, tmp2, tmp3);
3558 
3559     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3560     __ ins(result_hi, __ D, tmp2, 0, 1);
3561     __ ins(result_lo, __ D, tmp2, 1, 0);
3562   }
3563 
3564   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3565                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3566     const FloatRegister t0 = result;
3567 
3568     // The GCM field polynomial f is z^128 + p(z), where p =
3569     // z^7+z^2+z+1.
3570     //
3571     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3572     //
3573     // so, given that the product we're reducing is
3574     //    a == lo + hi * z^128
3575     // substituting,
3576     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3577     //
3578     // we reduce by multiplying hi by p(z) and subtracting the result
3579     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3580     // bits we can do this with two 64-bit multiplications, lo*p and
3581     // hi*p.
3582 
3583     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3584     __ ext(t1, __ T16B, t0, z, 8);
3585     __ eor(hi, __ T16B, hi, t1);
3586     __ ext(t1, __ T16B, z, t0, 8);
3587     __ eor(lo, __ T16B, lo, t1);
3588     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3589     __ eor(result, __ T16B, lo, t0);
3590   }
3591 
3592   address generate_has_negatives(address &has_negatives_long) {
3593     const u1 large_loop_size = 64;
3594     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3595     int dcache_line = VM_Version::dcache_line_size();
3596 
3597     Register ary1 = r1, len = r2, result = r0;
3598 
3599     __ align(CodeEntryAlignment);
3600 
3601     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3602 
3603     address entry = __ pc();
3604 
3605     __ enter();
3606 
3607   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3608         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3609 
3610   __ cmp(len, (u1)15);
3611   __ br(Assembler::GT, LEN_OVER_15);
3612   // The only case in which execution reaches this code is when the pointer is near
3613   // the end of a memory page and we must avoid reading past it into the next page.
3614   __ add(ary1, ary1, len);
3615   __ subs(len, len, 8);
3616   __ br(Assembler::GT, LEN_OVER_8);
3617   __ ldr(rscratch2, Address(ary1, -8));
3618   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3619   __ lsrv(rscratch2, rscratch2, rscratch1);
3620   __ tst(rscratch2, UPPER_BIT_MASK);
3621   __ cset(result, Assembler::NE);
3622   __ leave();
3623   __ ret(lr);
3624   __ bind(LEN_OVER_8);
3625   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3626   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3627   __ tst(rscratch2, UPPER_BIT_MASK);
3628   __ br(Assembler::NE, RET_TRUE_NO_POP);
3629   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3630   __ lsrv(rscratch1, rscratch1, rscratch2);
3631   __ tst(rscratch1, UPPER_BIT_MASK);
3632   __ cset(result, Assembler::NE);
3633   __ leave();
3634   __ ret(lr);
3635 
3636   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3637   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3638 
3639   has_negatives_long = __ pc(); // 2nd entry point
3640 
3641   __ enter();
3642 
3643   __ bind(LEN_OVER_15);
3644     __ push(spilled_regs, sp);
3645     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3646     __ cbz(rscratch2, ALIGNED);
3647     __ ldp(tmp6, tmp1, Address(ary1));
3648     __ mov(tmp5, 16);
3649     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3650     __ add(ary1, ary1, rscratch1);
3651     __ sub(len, len, rscratch1);
3652     __ orr(tmp6, tmp6, tmp1);
3653     __ tst(tmp6, UPPER_BIT_MASK);
3654     __ br(Assembler::NE, RET_TRUE);
3655 
3656   __ bind(ALIGNED);
3657     __ cmp(len, large_loop_size);
3658     __ br(Assembler::LT, CHECK_16);
3659     // Perform a 16-byte load in the pre-loop as an early return, to handle the case
3660     // where an initially aligned large array has negative values in its first bytes;
3661     // otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the worst case,
3662     // which is slower. Cases with negative bytes further ahead are barely affected
3663     // and in fact run faster, thanks to the early loads and the fewer instructions
3664     // and branches in LARGE_LOOP.
3665     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3666     __ sub(len, len, 16);
3667     __ orr(tmp6, tmp6, tmp1);
3668     __ tst(tmp6, UPPER_BIT_MASK);
3669     __ br(Assembler::NE, RET_TRUE);
3670     __ cmp(len, large_loop_size);
3671     __ br(Assembler::LT, CHECK_16);
3672 
3673     if (SoftwarePrefetchHintDistance >= 0
3674         && SoftwarePrefetchHintDistance >= dcache_line) {
3675       // initial prefetch
3676       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3677     }
3678   __ bind(LARGE_LOOP);
3679     if (SoftwarePrefetchHintDistance >= 0) {
3680       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3681     }
3682     // Issue the load instructions first, since that can save a few CPU/memory cycles.
3683     // Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one per ldp), it is
3684     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3685     // instructions and has fewer branches. The drawback is that early return is lost,
3686     // so all 64 bytes are loaded and checked every time.
3687     __ ldp(tmp2, tmp3, Address(ary1));
3688     __ ldp(tmp4, tmp5, Address(ary1, 16));
3689     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3690     __ ldp(tmp6, tmp1, Address(ary1, 48));
3691     __ add(ary1, ary1, large_loop_size);
3692     __ sub(len, len, large_loop_size);
3693     __ orr(tmp2, tmp2, tmp3);
3694     __ orr(tmp4, tmp4, tmp5);
3695     __ orr(rscratch1, rscratch1, rscratch2);
3696     __ orr(tmp6, tmp6, tmp1);
3697     __ orr(tmp2, tmp2, tmp4);
3698     __ orr(rscratch1, rscratch1, tmp6);
3699     __ orr(tmp2, tmp2, rscratch1);
3700     __ tst(tmp2, UPPER_BIT_MASK);
3701     __ br(Assembler::NE, RET_TRUE);
3702     __ cmp(len, large_loop_size);
3703     __ br(Assembler::GE, LARGE_LOOP);
3704 
3705   __ bind(CHECK_16); // small 16-byte load pre-loop
3706     __ cmp(len, (u1)16);
3707     __ br(Assembler::LT, POST_LOOP16);
3708 
3709   __ bind(LOOP16); // small 16-byte load loop
3710     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3711     __ sub(len, len, 16);
3712     __ orr(tmp2, tmp2, tmp3);
3713     __ tst(tmp2, UPPER_BIT_MASK);
3714     __ br(Assembler::NE, RET_TRUE);
3715     __ cmp(len, (u1)16);
3716     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3717 
3718   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3719     __ cmp(len, (u1)8);
3720     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3721     __ ldr(tmp3, Address(__ post(ary1, 8)));
3722     __ sub(len, len, 8);
3723     __ tst(tmp3, UPPER_BIT_MASK);
3724     __ br(Assembler::NE, RET_TRUE);
3725 
3726   __ bind(POST_LOOP16_LOAD_TAIL);
3727     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3728     __ ldr(tmp1, Address(ary1));
3729     __ mov(tmp2, 64);
3730     __ sub(tmp4, tmp2, len, __ LSL, 3);
3731     __ lslv(tmp1, tmp1, tmp4);
3732     __ tst(tmp1, UPPER_BIT_MASK);
3733     __ br(Assembler::NE, RET_TRUE);
3734     // Fallthrough
3735 
3736   __ bind(RET_FALSE);
3737     __ pop(spilled_regs, sp);
3738     __ leave();
3739     __ mov(result, zr);
3740     __ ret(lr);
3741 
3742   __ bind(RET_TRUE);
3743     __ pop(spilled_regs, sp);
3744   __ bind(RET_TRUE_NO_POP);
3745     __ leave();
3746     __ mov(result, 1);
3747     __ ret(lr);
3748 
3749   __ bind(DONE);
3750     __ pop(spilled_regs, sp);
3751     __ leave();
3752     __ ret(lr);
3753     return entry;
3754   }
3755 
3756   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3757         bool usePrefetch, Label &NOT_EQUAL) {
3758     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3759         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3760         tmp7 = r12, tmp8 = r13;
3761     Label LOOP;
3762 
3763     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3764     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3765     __ bind(LOOP);
3766     if (usePrefetch) {
3767       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3768       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3769     }
3770     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3771     __ eor(tmp1, tmp1, tmp2);
3772     __ eor(tmp3, tmp3, tmp4);
3773     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3774     __ orr(tmp1, tmp1, tmp3);
3775     __ cbnz(tmp1, NOT_EQUAL);
3776     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3777     __ eor(tmp5, tmp5, tmp6);
3778     __ eor(tmp7, tmp7, tmp8);
3779     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3780     __ orr(tmp5, tmp5, tmp7);
3781     __ cbnz(tmp5, NOT_EQUAL);
3782     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3783     __ eor(tmp1, tmp1, tmp2);
3784     __ eor(tmp3, tmp3, tmp4);
3785     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3786     __ orr(tmp1, tmp1, tmp3);
3787     __ cbnz(tmp1, NOT_EQUAL);
3788     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3789     __ eor(tmp5, tmp5, tmp6);
3790     __ sub(cnt1, cnt1, 8 * wordSize);
3791     __ eor(tmp7, tmp7, tmp8);
3792     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3793     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3794     // cmp) because subs allows an unlimited range of immediate operand.
3795     __ subs(tmp6, cnt1, loopThreshold);
3796     __ orr(tmp5, tmp5, tmp7);
3797     __ cbnz(tmp5, NOT_EQUAL);
3798     __ br(__ GE, LOOP);
3799     // post-loop
3800     __ eor(tmp1, tmp1, tmp2);
3801     __ eor(tmp3, tmp3, tmp4);
3802     __ orr(tmp1, tmp1, tmp3);
3803     __ sub(cnt1, cnt1, 2 * wordSize);
3804     __ cbnz(tmp1, NOT_EQUAL);
3805   }
3806 
3807   void generate_large_array_equals_loop_simd(int loopThreshold,
3808         bool usePrefetch, Label &NOT_EQUAL) {
3809     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3810         tmp2 = rscratch2;
3811     Label LOOP;
3812 
3813     __ bind(LOOP);
3814     if (usePrefetch) {
3815       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3816       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3817     }
3818     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3819     __ sub(cnt1, cnt1, 8 * wordSize);
3820     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3821     __ subs(tmp1, cnt1, loopThreshold);
3822     __ eor(v0, __ T16B, v0, v4);
3823     __ eor(v1, __ T16B, v1, v5);
3824     __ eor(v2, __ T16B, v2, v6);
3825     __ eor(v3, __ T16B, v3, v7);
3826     __ orr(v0, __ T16B, v0, v1);
3827     __ orr(v1, __ T16B, v2, v3);
3828     __ orr(v0, __ T16B, v0, v1);
3829     __ umov(tmp1, v0, __ D, 0);
3830     __ umov(tmp2, v0, __ D, 1);
3831     __ orr(tmp1, tmp1, tmp2);
3832     __ cbnz(tmp1, NOT_EQUAL);
3833     __ br(__ GE, LOOP);
3834   }
3835 
3836   // a1 = r1 - array1 address
3837   // a2 = r2 - array2 address
3838   // result = r0 - return value. Already contains "false"
3839   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3840   // r3-r5 are reserved temporary registers
3841   address generate_large_array_equals() {
3842     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3843         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3844         tmp7 = r12, tmp8 = r13;
3845     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3846         SMALL_LOOP, POST_LOOP;
3847     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3848     // calculate if at least 32 prefetched bytes are used
3849     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3850     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3851     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3852     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3853         tmp5, tmp6, tmp7, tmp8);
3854 
3855     __ align(CodeEntryAlignment);
3856 
3857     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3858 
3859     address entry = __ pc();
3860     __ enter();
3861     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3862     // also advance pointers to use post-increment instead of pre-increment
3863     __ add(a1, a1, wordSize);
3864     __ add(a2, a2, wordSize);
3865     if (AvoidUnalignedAccesses) {
3866       // Both implementations (SIMD and non-SIMD) use relatively large load
3867       // instructions (ld1/ldp), which carry a heavy penalty (up to 2x execution time)
3868       // on some CPUs when the address is not at least 16-byte aligned.
3869       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
3870       // load to make at least the first address 16-byte aligned.
3871       Label ALIGNED16;
3872       __ tbz(a1, 3, ALIGNED16);
3873       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3874       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3875       __ sub(cnt1, cnt1, wordSize);
3876       __ eor(tmp1, tmp1, tmp2);
3877       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3878       __ bind(ALIGNED16);
3879     }
3880     if (UseSIMDForArrayEquals) {
3881       if (SoftwarePrefetchHintDistance >= 0) {
3882         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3883         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3884         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3885             /* prfm = */ true, NOT_EQUAL);
3886         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3887         __ br(__ LT, TAIL);
3888       }
3889       __ bind(NO_PREFETCH_LARGE_LOOP);
3890       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3891           /* prfm = */ false, NOT_EQUAL);
3892     } else {
3893       __ push(spilled_regs, sp);
3894       if (SoftwarePrefetchHintDistance >= 0) {
3895         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3896         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3897         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3898             /* prfm = */ true, NOT_EQUAL);
3899         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3900         __ br(__ LT, TAIL);
3901       }
3902       __ bind(NO_PREFETCH_LARGE_LOOP);
3903       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3904           /* prfm = */ false, NOT_EQUAL);
3905     }
3906     __ bind(TAIL);
3907       __ cbz(cnt1, EQUAL);
3908       __ subs(cnt1, cnt1, wordSize);
3909       __ br(__ LE, POST_LOOP);
3910     __ bind(SMALL_LOOP);
3911       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3912       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3913       __ subs(cnt1, cnt1, wordSize);
3914       __ eor(tmp1, tmp1, tmp2);
3915       __ cbnz(tmp1, NOT_EQUAL);
3916       __ br(__ GT, SMALL_LOOP);
3917     __ bind(POST_LOOP);
3918       __ ldr(tmp1, Address(a1, cnt1));
3919       __ ldr(tmp2, Address(a2, cnt1));
3920       __ eor(tmp1, tmp1, tmp2);
3921       __ cbnz(tmp1, NOT_EQUAL);
3922     __ bind(EQUAL);
3923       __ mov(result, true);
3924     __ bind(NOT_EQUAL);
3925       if (!UseSIMDForArrayEquals) {
3926         __ pop(spilled_regs, sp);
3927       }
3928     __ bind(NOT_EQUAL_NO_POP);
3929     __ leave();
3930     __ ret(lr);
3931     return entry;
3932   }
3933 
3934   address generate_dsin_dcos(bool isCos) {
3935     __ align(CodeEntryAlignment);
3936     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3937     address start = __ pc();
3938     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3939         (address)StubRoutines::aarch64::_two_over_pi,
3940         (address)StubRoutines::aarch64::_pio2,
3941         (address)StubRoutines::aarch64::_dsin_coef,
3942         (address)StubRoutines::aarch64::_dcos_coef);
3943     return start;
3944   }
3945 
3946   address generate_dlog() {
3947     __ align(CodeEntryAlignment);
3948     StubCodeMark mark(this, "StubRoutines", "dlog");
3949     address entry = __ pc();
3950     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3951         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3952     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3953     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3954         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3955     return entry;
3956   }
3957 
3958   // code for comparing 16 bytes of strings with same encoding
3959   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3960     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3961     __ ldr(rscratch1, Address(__ post(str1, 8)));
3962     __ eor(rscratch2, tmp1, tmp2);
3963     __ ldr(cnt1, Address(__ post(str2, 8)));
3964     __ cbnz(rscratch2, DIFF1);
3965     __ ldr(tmp1, Address(__ post(str1, 8)));
3966     __ eor(rscratch2, rscratch1, cnt1);
3967     __ ldr(tmp2, Address(__ post(str2, 8)));
3968     __ cbnz(rscratch2, DIFF2);
3969   }
3970 
3971   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3972   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3973       Label &DIFF2) {
3974     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3975     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3976 
3977     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3978     __ ldr(tmpU, Address(__ post(cnt1, 8)));
3979     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
3980     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
3981 
3982     __ fmovd(tmpL, vtmp3);
3983     __ eor(rscratch2, tmp3, tmpL);
3984     __ cbnz(rscratch2, DIFF2);
3985 
3986     __ ldr(tmp3, Address(__ post(cnt1, 8)));
3987     __ umov(tmpL, vtmp3, __ D, 1);
3988     __ eor(rscratch2, tmpU, tmpL);
3989     __ cbnz(rscratch2, DIFF1);
3990 
3991     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
3992     __ ldr(tmpU, Address(__ post(cnt1, 8)));
3993     __ fmovd(tmpL, vtmp);
3994     __ eor(rscratch2, tmp3, tmpL);
3995     __ cbnz(rscratch2, DIFF2);
3996 
3997     __ ldr(tmp3, Address(__ post(cnt1, 8)));
3998     __ umov(tmpL, vtmp, __ D, 1);
3999     __ eor(rscratch2, tmpU, tmpL);
4000     __ cbnz(rscratch2, DIFF1);
4001   }
4002 
4003   // r0  = result
4004   // r1  = str1
4005   // r2  = cnt1
4006   // r3  = str2
4007   // r4  = cnt2
4008   // r10 = tmp1
4009   // r11 = tmp2
4010   address generate_compare_long_string_different_encoding(bool isLU) {
4011     __ align(CodeEntryAlignment);
4012     StubCodeMark mark(this, "StubRoutines", isLU
4013         ? "compare_long_string_different_encoding LU"
4014         : "compare_long_string_different_encoding UL");
4015     address entry = __ pc();
4016     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4017         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4018         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4019     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4020         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4021     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4022     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4023 
4024     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4025 
4026     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4027     // cnt2 == amount of characters left to compare
4028     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
4029     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4030     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4031     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4032     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4033     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4034     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4035     __ eor(rscratch2, tmp1, tmp2);
4036     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4037     __ mov(rscratch1, tmp2);
4038     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4039     Register strU = isLU ? str2 : str1,
4040              strL = isLU ? str1 : str2,
4041              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4042              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4043     __ push(spilled_regs, sp);
4044     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4045     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4046 
4047     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4048 
4049     if (SoftwarePrefetchHintDistance >= 0) {
4050       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4051       __ br(__ LT, NO_PREFETCH);
4052       __ bind(LARGE_LOOP_PREFETCH);
4053         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4054         __ mov(tmp4, 2);
4055         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4056         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4057           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4058           __ subs(tmp4, tmp4, 1);
4059           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4060           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4061           __ mov(tmp4, 2);
4062         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4063           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4064           __ subs(tmp4, tmp4, 1);
4065           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4066           __ sub(cnt2, cnt2, 64);
4067           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4068           __ br(__ GE, LARGE_LOOP_PREFETCH);
4069     }
4070     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4071     __ bind(NO_PREFETCH);
4072     __ subs(cnt2, cnt2, 16);
4073     __ br(__ LT, TAIL);
4074     __ bind(SMALL_LOOP); // smaller loop
4075       __ subs(cnt2, cnt2, 16);
4076       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4077       __ br(__ GE, SMALL_LOOP);
4078       __ cmn(cnt2, (u1)16);
4079       __ br(__ EQ, LOAD_LAST);
4080     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4081       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4082       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4083       __ ldr(tmp3, Address(cnt1, -8));
4084       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4085       __ b(LOAD_LAST);
4086     __ bind(DIFF2);
4087       __ mov(tmpU, tmp3);
4088     __ bind(DIFF1);
4089       __ pop(spilled_regs, sp);
4090       __ b(CALCULATE_DIFFERENCE);
4091     __ bind(LOAD_LAST);
4092       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4093       // No need to load it again
4094       __ mov(tmpU, tmp3);
4095       __ pop(spilled_regs, sp);
4096 
4097       __ ldrs(vtmp, Address(strL));
4098       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4099       __ fmovd(tmpL, vtmp);
4100 
4101       __ eor(rscratch2, tmpU, tmpL);
4102       __ cbz(rscratch2, DONE);
4103 
4104     // Find the first different characters in the longwords and
4105     // compute their difference.
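    // (Sketch of the mechanism: rscratch2 holds the XOR of the two 64-bit chunks;
    // rev + clz locate the first differing bit in memory order, andr rounds that
    // position down to a 16-bit character boundary, and the lsrv/uxthw pairs then
    // isolate the two differing characters for the final subtraction.)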
4106     __ bind(CALCULATE_DIFFERENCE);
4107       __ rev(rscratch2, rscratch2);
4108       __ clz(rscratch2, rscratch2);
4109       __ andr(rscratch2, rscratch2, -16);
4110       __ lsrv(tmp1, tmp1, rscratch2);
4111       __ uxthw(tmp1, tmp1);
4112       __ lsrv(rscratch1, rscratch1, rscratch2);
4113       __ uxthw(rscratch1, rscratch1);
4114       __ subw(result, tmp1, rscratch1);
4115     __ bind(DONE);
4116       __ ret(lr);
4117     return entry;
4118   }
4119 
4120   // r0  = result
4121   // r1  = str1
4122   // r2  = cnt1
4123   // r3  = str2
4124   // r4  = cnt2
4125   // r10 = tmp1
4126   // r11 = tmp2
4127   address generate_compare_long_string_same_encoding(bool isLL) {
4128     __ align(CodeEntryAlignment);
4129     StubCodeMark mark(this, "StubRoutines", isLL
4130         ? "compare_long_string_same_encoding LL"
4131         : "compare_long_string_same_encoding UU");
4132     address entry = __ pc();
4133     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4134         tmp1 = r10, tmp2 = r11;
4135     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4136         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4137         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4138     // exit the large loop when fewer than 64 bytes are left to read, or when we are
4139     // about to prefetch memory beyond the end of the array
4140     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4141     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4142     // update cnt2 counter with already loaded 8 bytes
4143     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4144     // update pointers, because of previous read
4145     __ add(str1, str1, wordSize);
4146     __ add(str2, str2, wordSize);
4147     if (SoftwarePrefetchHintDistance >= 0) {
4148       __ bind(LARGE_LOOP_PREFETCH);
4149         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4150         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4151         compare_string_16_bytes_same(DIFF, DIFF2);
4152         compare_string_16_bytes_same(DIFF, DIFF2);
4153         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4154         compare_string_16_bytes_same(DIFF, DIFF2);
4155         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4156         compare_string_16_bytes_same(DIFF, DIFF2);
4157         __ br(__ GT, LARGE_LOOP_PREFETCH);
4158         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4159     }
4160     // less than 16 bytes left?
4161     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4162     __ br(__ LT, TAIL);
4163     __ bind(SMALL_LOOP);
4164       compare_string_16_bytes_same(DIFF, DIFF2);
4165       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4166       __ br(__ GE, SMALL_LOOP);
4167     __ bind(TAIL);
4168       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4169       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4170       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4171       __ br(__ LE, CHECK_LAST);
4172       __ eor(rscratch2, tmp1, tmp2);
4173       __ cbnz(rscratch2, DIFF);
4174       __ ldr(tmp1, Address(__ post(str1, 8)));
4175       __ ldr(tmp2, Address(__ post(str2, 8)));
4176       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4177     __ bind(CHECK_LAST);
4178       if (!isLL) {
4179         __ add(cnt2, cnt2, cnt2); // now in bytes
4180       }
4181       __ eor(rscratch2, tmp1, tmp2);
4182       __ cbnz(rscratch2, DIFF);
4183       __ ldr(rscratch1, Address(str1, cnt2));
4184       __ ldr(cnt1, Address(str2, cnt2));
4185       __ eor(rscratch2, rscratch1, cnt1);
4186       __ cbz(rscratch2, LENGTH_DIFF);
4187       // Find the first different characters in the longwords and
4188       // compute their difference.
4189     __ bind(DIFF2);
4190       __ rev(rscratch2, rscratch2);
4191       __ clz(rscratch2, rscratch2);
4192       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4193       __ lsrv(rscratch1, rscratch1, rscratch2);
4194       if (isLL) {
4195         __ lsrv(cnt1, cnt1, rscratch2);
4196         __ uxtbw(rscratch1, rscratch1);
4197         __ uxtbw(cnt1, cnt1);
4198       } else {
4199         __ lsrv(cnt1, cnt1, rscratch2);
4200         __ uxthw(rscratch1, rscratch1);
4201         __ uxthw(cnt1, cnt1);
4202       }
4203       __ subw(result, rscratch1, cnt1);
4204       __ b(LENGTH_DIFF);
4205     __ bind(DIFF);
4206       __ rev(rscratch2, rscratch2);
4207       __ clz(rscratch2, rscratch2);
4208       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4209       __ lsrv(tmp1, tmp1, rscratch2);
4210       if (isLL) {
4211         __ lsrv(tmp2, tmp2, rscratch2);
4212         __ uxtbw(tmp1, tmp1);
4213         __ uxtbw(tmp2, tmp2);
4214       } else {
4215         __ lsrv(tmp2, tmp2, rscratch2);
4216         __ uxthw(tmp1, tmp1);
4217         __ uxthw(tmp2, tmp2);
4218       }
4219       __ subw(result, tmp1, tmp2);
4220       __ b(LENGTH_DIFF);
4221     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4222       __ eor(rscratch2, tmp1, tmp2);
4223       __ cbnz(rscratch2, DIFF);
4224     __ bind(LENGTH_DIFF);
4225       __ ret(lr);
4226     return entry;
4227   }
4228 
4229   void generate_compare_long_strings() {
4230       StubRoutines::aarch64::_compare_long_string_LL
4231           = generate_compare_long_string_same_encoding(true);
4232       StubRoutines::aarch64::_compare_long_string_UU
4233           = generate_compare_long_string_same_encoding(false);
4234       StubRoutines::aarch64::_compare_long_string_LU
4235           = generate_compare_long_string_different_encoding(true);
4236       StubRoutines::aarch64::_compare_long_string_UL
4237           = generate_compare_long_string_different_encoding(false);
4238   }
4239 
4240   // R0 = result
4241   // R1 = str2
4242   // R2 = cnt1
4243   // R3 = str1
4244   // R4 = cnt2
4245   // This generic linear code uses a few additional ideas which make it faster:
4246   // 1) we can safely keep at least the 1st register of the pattern loaded (since
4247   // length >= 8), in order to skip the initial load (helps on systems with 1 load pipeline)
4248   // 2) we can use a "fast" algorithm to find the first occurrence of the search
4249   // character with fewer branches (1 branch per loaded register instead of a branch
4250   // per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
4251   // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the C sketch after this comment)
4252   // 3) after loading and analyzing the 1st register of the source string, it can be
4253   // re-used to search for every occurrence of the 1st character, saving a few loads
4254   // compared with a "simpler-but-slower" implementation
4255   // 4) in order to avoid lots of push/pop operations, the code below heavily
4256   // re-uses/re-initializes/compresses register values, which makes the code
4257   // larger and a bit less readable; however, most of the extra operations are
4258   // issued during loads or branches, so the penalty is minimal
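  // In C, the "fast" first-character scan is approximately (a sketch of the
  // classic SWAR zero-byte test for the LL case; the UU case uses the 16-bit
  // constants instead):
  //
  //   uint64_t pattern = first_char * 0x0101010101010101ull; // char in every byte
  //   uint64_t x   = load8(str2) ^ pattern;                  // zero byte <=> match
  //   uint64_t hit = (x - 0x0101010101010101ull) & ~(x | 0x7f7f7f7f7f7f7f7full);
  //   // hit has 0x80 set in every byte position where str2 matches the first
  //   // character of the pattern, so a single branch on hit != 0 covers 8 bytes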
4259   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4260     const char* stubName = str1_isL
4261         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4262         : "indexof_linear_uu";
4263     __ align(CodeEntryAlignment);
4264     StubCodeMark mark(this, "StubRoutines", stubName);
4265     address entry = __ pc();
4266 
4267     int str1_chr_size = str1_isL ? 1 : 2;
4268     int str2_chr_size = str2_isL ? 1 : 2;
4269     int str1_chr_shift = str1_isL ? 0 : 1;
4270     int str2_chr_shift = str2_isL ? 0 : 1;
4271     bool isL = str1_isL && str2_isL;
4272     // parameters
4273     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4274     // temporary registers
4275     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4276     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4277     // redefinitions
4278     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4279 
4280     __ push(spilled_regs, sp);
4281     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4282         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4283         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4284         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4285         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4286         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4287     // Read whole register from str1. It is safe, because length >=8 here
4288     __ ldr(ch1, Address(str1));
4289     // Read whole register from str2. It is safe, because length >=8 here
4290     __ ldr(ch2, Address(str2));
4291     __ sub(cnt2, cnt2, cnt1);
4292     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4293     if (str1_isL != str2_isL) {
4294       __ eor(v0, __ T16B, v0, v0);
4295     }
4296     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4297     __ mul(first, first, tmp1);
4298     // check if we have less than 1 register to check
4299     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4300     if (str1_isL != str2_isL) {
4301       __ fmovd(v1, ch1);
4302     }
4303     __ br(__ LE, L_SMALL);
4304     __ eor(ch2, first, ch2);
4305     if (str1_isL != str2_isL) {
4306       __ zip1(v1, __ T16B, v1, v0);
4307     }
4308     __ sub(tmp2, ch2, tmp1);
4309     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4310     __ bics(tmp2, tmp2, ch2);
4311     if (str1_isL != str2_isL) {
4312       __ fmovd(ch1, v1);
4313     }
4314     __ br(__ NE, L_HAS_ZERO);
4315     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4316     __ add(result, result, wordSize/str2_chr_size);
4317     __ add(str2, str2, wordSize);
4318     __ br(__ LT, L_POST_LOOP);
4319     __ BIND(L_LOOP);
4320       __ ldr(ch2, Address(str2));
4321       __ eor(ch2, first, ch2);
4322       __ sub(tmp2, ch2, tmp1);
4323       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4324       __ bics(tmp2, tmp2, ch2);
4325       __ br(__ NE, L_HAS_ZERO);
4326     __ BIND(L_LOOP_PROCEED);
4327       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4328       __ add(str2, str2, wordSize);
4329       __ add(result, result, wordSize/str2_chr_size);
4330       __ br(__ GE, L_LOOP);
4331     __ BIND(L_POST_LOOP);
4332       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4333       __ br(__ LE, NOMATCH);
4334       __ ldr(ch2, Address(str2));
4335       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4336       __ eor(ch2, first, ch2);
4337       __ sub(tmp2, ch2, tmp1);
4338       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4339       __ mov(tmp4, -1); // all bits set
4340       __ b(L_SMALL_PROCEED);
4341     __ align(OptoLoopAlignment);
4342     __ BIND(L_SMALL);
4343       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4344       __ eor(ch2, first, ch2);
4345       if (str1_isL != str2_isL) {
4346         __ zip1(v1, __ T16B, v1, v0);
4347       }
4348       __ sub(tmp2, ch2, tmp1);
4349       __ mov(tmp4, -1); // all bits set
4350       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4351       if (str1_isL != str2_isL) {
4352         __ fmovd(ch1, v1); // move converted 4 symbols
4353       }
4354     __ BIND(L_SMALL_PROCEED);
4355       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4356       __ bic(tmp2, tmp2, ch2);
4357       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4358       __ rbit(tmp2, tmp2);
4359       __ br(__ EQ, NOMATCH);
4360     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4361       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4362       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4363       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4364       if (str2_isL) { // LL
4365         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4366         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4367         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4368         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4369         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4370       } else {
4371         __ mov(ch2, 0xE); // all bits in byte set except last one
4372         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4373         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4374         __ lslv(tmp2, tmp2, tmp4);
4375         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4376         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4377         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4378         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4379       }
4380       __ cmp(ch1, ch2);
4381       __ mov(tmp4, wordSize/str2_chr_size);
4382       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4383     __ BIND(L_SMALL_CMP_LOOP);
4384       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4385                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4386       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4387                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4388       __ add(tmp4, tmp4, 1);
4389       __ cmp(tmp4, cnt1);
4390       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4391       __ cmp(first, ch2);
4392       __ br(__ EQ, L_SMALL_CMP_LOOP);
4393     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4394       __ cbz(tmp2, NOMATCH); // no more matches. exit
4395       __ clz(tmp4, tmp2);
4396       __ add(result, result, 1); // advance index
4397       __ add(str2, str2, str2_chr_size); // advance pointer
4398       __ b(L_SMALL_HAS_ZERO_LOOP);
4399     __ align(OptoLoopAlignment);
4400     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4401       __ cmp(first, ch2);
4402       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4403       __ b(DONE);
4404     __ align(OptoLoopAlignment);
4405     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4406       if (str2_isL) { // LL
4407         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4408         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4409         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4410         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4411         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4412       } else {
4413         __ mov(ch2, 0xE); // all bits in byte set except last one
4414         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4415         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4416         __ lslv(tmp2, tmp2, tmp4);
4417         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4418         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4419         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4420         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4421       }
4422       __ cmp(ch1, ch2);
4423       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4424       __ b(DONE);
4425     __ align(OptoLoopAlignment);
4426     __ BIND(L_HAS_ZERO);
4427       __ rbit(tmp2, tmp2);
4428       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4429       // Now compress the counters (cnt2 and cnt1) into one register: cnt1 goes
4430       // into the upper 32 bits. This is fine because both counters are 32-bit and
4431       // are not changed in this loop; they are restored on exit, so cnt1 can be re-used here.
4432       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4433       __ sub(result, result, 1);
4434     __ BIND(L_HAS_ZERO_LOOP);
4435       __ mov(cnt1, wordSize/str2_chr_size);
4436       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4437       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4438       if (str2_isL) {
4439         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4440         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4441         __ lslv(tmp2, tmp2, tmp4);
4442         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4443         __ add(tmp4, tmp4, 1);
4444         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4445         __ lsl(tmp2, tmp2, 1);
4446         __ mov(tmp4, wordSize/str2_chr_size);
4447       } else {
4448         __ mov(ch2, 0xE);
4449         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4450         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4451         __ lslv(tmp2, tmp2, tmp4);
4452         __ add(tmp4, tmp4, 1);
4453         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4454         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4455         __ lsl(tmp2, tmp2, 1);
4456         __ mov(tmp4, wordSize/str2_chr_size);
4457         __ sub(str2, str2, str2_chr_size);
4458       }
4459       __ cmp(ch1, ch2);
4460       __ mov(tmp4, wordSize/str2_chr_size);
4461       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4462     __ BIND(L_CMP_LOOP);
4463       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4464                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4465       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4466                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4467       __ add(tmp4, tmp4, 1);
4468       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4469       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4470       __ cmp(cnt1, ch2);
4471       __ br(__ EQ, L_CMP_LOOP);
4472     __ BIND(L_CMP_LOOP_NOMATCH);
4473       // here we have no match
4474       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4475       __ clz(tmp4, tmp2);
4476       __ add(str2, str2, str2_chr_size); // advance pointer
4477       __ b(L_HAS_ZERO_LOOP);
4478     __ align(OptoLoopAlignment);
4479     __ BIND(L_CMP_LOOP_LAST_CMP);
4480       __ cmp(cnt1, ch2);
4481       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4482       __ b(DONE);
4483     __ align(OptoLoopAlignment);
4484     __ BIND(L_CMP_LOOP_LAST_CMP2);
4485       if (str2_isL) {
4486         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4487         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4488         __ lslv(tmp2, tmp2, tmp4);
4489         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4490         __ add(tmp4, tmp4, 1);
4491         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4492         __ lsl(tmp2, tmp2, 1);
4493       } else {
4494         __ mov(ch2, 0xE);
4495         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4496         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4497         __ lslv(tmp2, tmp2, tmp4);
4498         __ add(tmp4, tmp4, 1);
4499         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4500         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4501         __ lsl(tmp2, tmp2, 1);
4502         __ sub(str2, str2, str2_chr_size);
4503       }
4504       __ cmp(ch1, ch2);
4505       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4506       __ b(DONE);
4507     __ align(OptoLoopAlignment);
4508     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4509       // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
4510       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4511       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4512       // higher bits weren't changed. L_LOOP_PROCEED will increase result by the
4513       // number of analyzed characters, so we can just reset the lower bits of
4514       // result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
4515       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4516       // 3) Adjust str2 to correspond to the next str2 octet. result & 7 (or & 3)
4517       // is the index of the last analyzed substring inside the current octet, so
4518       // str2 is moved back to the octet's start address; L_LOOP_PROCEED then advances it.
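      // In C, the restore below is approximately (a sketch):
      //
      //   analyzed = result & (wordSize/str2_chr_size - 1);
      //   result  &= ~(wordSize/str2_chr_size - 1);     // the bfm below
      //   cnt1     = cnt2 >> 32;                        // undo the compression
      //   cnt2     = (uint32_t)cnt2;
      //   str2    -= analyzed << str2_chr_shift;        // back to the octet start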
4519       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4520       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4521       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4522       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4523       __ movw(cnt2, cnt2);
4524       __ b(L_LOOP_PROCEED);
4525     __ align(OptoLoopAlignment);
4526     __ BIND(NOMATCH);
4527       __ mov(result, -1);
4528     __ BIND(DONE);
4529       __ pop(spilled_regs, sp);
4530       __ ret(lr);
4531     return entry;
4532   }
4533 
4534   void generate_string_indexof_stubs() {
4535     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4536     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4537     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4538   }
4539 
4540   void inflate_and_store_2_fp_registers(bool generatePrfm,
4541       FloatRegister src1, FloatRegister src2) {
4542     Register dst = r1;
4543     __ zip1(v1, __ T16B, src1, v0);
4544     __ zip2(v2, __ T16B, src1, v0);
4545     if (generatePrfm) {
4546       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4547     }
4548     __ zip1(v3, __ T16B, src2, v0);
4549     __ zip2(v4, __ T16B, src2, v0);
4550     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4551   }
4552 
4553   // R0 = src
4554   // R1 = dst
4555   // R2 = len
4556   // R3 = len >> 3
4557   // V0 = 0
4558   // v1 = loaded 8 bytes
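  // In C, one inflate step is approximately (a sketch): each Latin-1 byte is
  // widened to a little-endian 16-bit char by interleaving it with a zero
  // byte, which is what zip1/zip2 against the zero register v0 do, 16 bytes
  // at a time.
  //
  //   void inflate8(const uint8_t* src, uint16_t* dst) {
  //     for (int i = 0; i < 8; i++) {
  //       dst[i] = src[i];   // high byte is implicitly zero
  //     }
  //   }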
4559   address generate_large_byte_array_inflate() {
4560     __ align(CodeEntryAlignment);
4561     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4562     address entry = __ pc();
4563     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4564     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4565     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4566 
4567     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
4568     // this also lets us use a single store instruction
4569     __ ldrd(v2, __ post(src, 8));
4570     __ sub(octetCounter, octetCounter, 2);
4571     __ zip1(v1, __ T16B, v1, v0);
4572     __ zip1(v2, __ T16B, v2, v0);
4573     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4574     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4575     __ subs(rscratch1, octetCounter, large_loop_threshold);
4576     __ br(__ LE, LOOP_START);
4577     __ b(LOOP_PRFM_START);
4578     __ bind(LOOP_PRFM);
4579       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4580     __ bind(LOOP_PRFM_START);
4581       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4582       __ sub(octetCounter, octetCounter, 8);
4583       __ subs(rscratch1, octetCounter, large_loop_threshold);
4584       inflate_and_store_2_fp_registers(true, v3, v4);
4585       inflate_and_store_2_fp_registers(true, v5, v6);
4586       __ br(__ GT, LOOP_PRFM);
4587       __ cmp(octetCounter, (u1)8);
4588       __ br(__ LT, DONE);
4589     __ bind(LOOP);
4590       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4591       __ bind(LOOP_START);
4592       __ sub(octetCounter, octetCounter, 8);
4593       __ cmp(octetCounter, (u1)8);
4594       inflate_and_store_2_fp_registers(false, v3, v4);
4595       inflate_and_store_2_fp_registers(false, v5, v6);
4596       __ br(__ GE, LOOP);
4597     __ bind(DONE);
4598       __ ret(lr);
4599     return entry;
4600   }
4601 
4602   /**
4603    *  Arguments:
4604    *
4605    *  Input:
4606    *  c_rarg0   - current state address
4607    *  c_rarg1   - H key address
4608    *  c_rarg2   - data address
4609    *  c_rarg3   - number of blocks
4610    *
4611    *  Output:
4612    *  Updated state at c_rarg0
4613    */
4614   address generate_ghash_processBlocks() {
4615     // Bafflingly, GCM uses little-endian for the byte order, but
4616     // big-endian for the bit order.  For example, the polynomial 1 is
4617     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4618     //
4619     // So, we must either reverse the bytes in each word and do
4620     // everything big-endian or reverse the bits in each byte and do
4621     // it little-endian.  On AArch64 it's more idiomatic to reverse
4622     // the bits in each byte (we have an instruction, RBIT, to do
4623     // that) and keep the data in little-endian bit order throughout the
4624     // calculation, bit-reversing the inputs and outputs.
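    // A sketch of the per-byte bit reversal (an assumed illustration, not code
    // from this stub): applying it twice restores the original layout, so the
    // inputs are reversed on the way in and the result on the way out.
    //
    //   static inline uint8_t rbit8(uint8_t b) {
    //     b = (uint8_t)((b & 0xF0) >> 4 | (b & 0x0F) << 4);
    //     b = (uint8_t)((b & 0xCC) >> 2 | (b & 0x33) << 2);
    //     return (uint8_t)((b & 0xAA) >> 1 | (b & 0x55) << 1);
    //   }
    //   // The RBIT instruction does this for all 16 bytes of a vector at once.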
4625 
4626     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4627     __ align(wordSize * 2);
4628     address p = __ pc();
4629     __ emit_int64(0x87);  // The low-order bits of the field
4630                           // polynomial (i.e. p = z^7+z^2+z+1)
4631                           // repeated in the low and high parts of a
4632                           // 128-bit vector
4633     __ emit_int64(0x87);
4634 
4635     __ align(CodeEntryAlignment);
4636     address start = __ pc();
4637 
4638     Register state   = c_rarg0;
4639     Register subkeyH = c_rarg1;
4640     Register data    = c_rarg2;
4641     Register blocks  = c_rarg3;
4642 
4643     FloatRegister vzr = v30;
4644     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4645 
4646     __ ldrq(v0, Address(state));
4647     __ ldrq(v1, Address(subkeyH));
4648 
4649     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4650     __ rbit(v0, __ T16B, v0);
4651     __ rev64(v1, __ T16B, v1);
4652     __ rbit(v1, __ T16B, v1);
4653 
4654     __ ldrq(v26, p);
4655 
4656     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4657     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4658 
4659     {
4660       Label L_ghash_loop;
4661       __ bind(L_ghash_loop);
4662 
4663       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4664                                                  // reversing each byte
4665       __ rbit(v2, __ T16B, v2);
4666       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4667 
4668       // Multiply state in v2 by subkey in v1
4669       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4670                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4671                      /*temps*/v6, v20, v18, v21);
4672       // Reduce v7:v5 by the field polynomial
4673       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4674 
4675       __ sub(blocks, blocks, 1);
4676       __ cbnz(blocks, L_ghash_loop);
4677     }
4678 
4679     // The bit-reversed result is at this point in v0
4680     __ rev64(v1, __ T16B, v0);
4681     __ rbit(v1, __ T16B, v1);
4682 
4683     __ st1(v1, __ T16B, state);
4684     __ ret(lr);
4685 
4686     return start;
4687   }
4688 
4689   void generate_base64_encode_simdround(Register src, Register dst,
4690         FloatRegister codec, u8 size) {
4691 
4692     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
4693     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
4694     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
4695 
4696     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
4697 
4698     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
4699 
4700     __ ushr(ind0, arrangement, in0,  2);
4701 
4702     __ ushr(ind1, arrangement, in1,  2);
4703     __ shl(in0,   arrangement, in0,  6);
4704     __ orr(ind1,  arrangement, ind1, in0);
4705     __ ushr(ind1, arrangement, ind1, 2);
4706 
4707     __ ushr(ind2, arrangement, in2,  4);
4708     __ shl(in1,   arrangement, in1,  4);
4709     __ orr(ind2,  arrangement, in1,  ind2);
4710     __ ushr(ind2, arrangement, ind2, 2);
4711 
4712     __ shl(ind3,  arrangement, in2,  2);
4713     __ ushr(ind3, arrangement, ind3, 2);
4714 
4715     __ tbl(out0,  arrangement, codec,  4, ind0);
4716     __ tbl(out1,  arrangement, codec,  4, ind1);
4717     __ tbl(out2,  arrangement, codec,  4, ind2);
4718     __ tbl(out3,  arrangement, codec,  4, ind3);
4719 
4720     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
4721   }
4722 
4723    /**
4724    *  Arguments:
4725    *
4726    *  Input:
4727    *  c_rarg0   - src_start
4728    *  c_rarg1   - src_offset
4729    *  c_rarg2   - src_length
4730    *  c_rarg3   - dest_start
4731    *  c_rarg4   - dest_offset
4732    *  c_rarg5   - isURL
4733    *
4734    */
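  // In C, the per-3-byte step performed by both the SIMD rounds and the
  // scalar Process3B tail below is approximately (a sketch, with a
  // hypothetical encode3 helper name):
  //
  //   void encode3(const uint8_t* src, uint8_t* dst, const char* codec) {
  //     uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2]; // 24 bits
  //     dst[0] = codec[(bits >> 18) & 0x3f];
  //     dst[1] = codec[(bits >> 12) & 0x3f];
  //     dst[2] = codec[(bits >>  6) & 0x3f];
  //     dst[3] = codec[ bits        & 0x3f];
  //   }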
4735   address generate_base64_encodeBlock() {
4736 
4737     static const char toBase64[64] = {
4738       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4739       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4740       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4741       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4742       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
4743     };
4744 
4745     static const char toBase64URL[64] = {
4746       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4747       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4748       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4749       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4750       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
4751     };
4752 
4753     __ align(CodeEntryAlignment);
4754     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
4755     address start = __ pc();
4756 
4757     Register src   = c_rarg0;  // source array
4758     Register soff  = c_rarg1;  // source start offset
4759     Register send  = c_rarg2;  // source end offset
4760     Register dst   = c_rarg3;  // dest array
4761     Register doff  = c_rarg4;  // position for writing to dest array
4762     Register isURL = c_rarg5;  // Base64 or URL character set
4763 
4764     // c_rarg6 and c_rarg7 are free to use as temps
4765     Register codec  = c_rarg6;
4766     Register length = c_rarg7;
4767 
4768     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
4769 
4770     __ add(src, src, soff);
4771     __ add(dst, dst, doff);
4772     __ sub(length, send, soff);
4773 
4774     // load the codec base address
4775     __ lea(codec, ExternalAddress((address) toBase64));
4776     __ cbz(isURL, ProcessData);
4777     __ lea(codec, ExternalAddress((address) toBase64URL));
4778 
4779     __ BIND(ProcessData);
4780 
4781     // too short to form up a SIMD loop, fall back to the scalar path
4782     __ cmp(length, (u1)24);
4783     __ br(Assembler::LT, Process3B);
4784 
4785     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
4786 
4787     __ BIND(Process48B);
4788     __ cmp(length, (u1)48);
4789     __ br(Assembler::LT, Process24B);
4790     generate_base64_encode_simdround(src, dst, v0, 16);
4791     __ sub(length, length, 48);
4792     __ b(Process48B);
4793 
4794     __ BIND(Process24B);
4795     __ cmp(length, (u1)24);
4796     __ br(Assembler::LT, SIMDExit);
4797     generate_base64_encode_simdround(src, dst, v0, 8);
4798     __ sub(length, length, 24);
4799 
4800     __ BIND(SIMDExit);
4801     __ cbz(length, Exit);
4802 
4803     __ BIND(Process3B);
4804     //  3 src bytes, 24 bits
4805     __ ldrb(r10, __ post(src, 1));
4806     __ ldrb(r11, __ post(src, 1));
4807     __ ldrb(r12, __ post(src, 1));
4808     __ orrw(r11, r11, r10, Assembler::LSL, 8);
4809     __ orrw(r12, r12, r11, Assembler::LSL, 8);
4810     // codec index
4811     __ ubfmw(r15, r12, 18, 23);
4812     __ ubfmw(r14, r12, 12, 17);
4813     __ ubfmw(r13, r12, 6,  11);
4814     __ andw(r12,  r12, 63);
4815     // get the code based on the codec
4816     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
4817     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
4818     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
4819     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
4820     __ strb(r15, __ post(dst, 1));
4821     __ strb(r14, __ post(dst, 1));
4822     __ strb(r13, __ post(dst, 1));
4823     __ strb(r12, __ post(dst, 1));
4824     __ sub(length, length, 3);
4825     __ cbnz(length, Process3B);
4826 
4827     __ BIND(Exit);
4828     __ ret(lr);
4829 
4830     return start;
4831   }
4832 
4833   // Continuation point for throwing of implicit exceptions that are
4834   // not handled in the current activation. Fabricates an exception
4835   // oop and initiates normal exception dispatching in this
4836   // frame. Since we need to preserve callee-saved values (currently
4837   // only for C2, but done for C1 as well) we need a callee-saved oop
4838   // map and therefore have to make these stubs into RuntimeStubs
4839   // rather than BufferBlobs.  If the compiler needs all registers to
4840   // be preserved between the fault point and the exception handler
4841   // then it must assume responsibility for that in
4842   // AbstractCompiler::continuation_for_implicit_null_exception or
4843   // continuation_for_implicit_division_by_zero_exception. All other
4844   // implicit exceptions (e.g., NullPointerException or
4845   // AbstractMethodError on entry) are either at call sites or
4846   // otherwise assume that stack unwinding will be initiated, so
4847   // caller saved registers were assumed volatile in the compiler.
4848 
4849 #undef __
4850 #define __ masm->
4851 
4852   address generate_throw_exception(const char* name,
4853                                    address runtime_entry,
4854                                    Register arg1 = noreg,
4855                                    Register arg2 = noreg) {
4856     // Information about frame layout at time of blocking runtime call.
4857     // Note that we only have to preserve callee-saved registers since
4858     // the compilers are responsible for supplying a continuation point
4859     // if they expect all registers to be preserved.
4860     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4861     enum layout {
4862       rfp_off = 0,
4863       rfp_off2,
4864       return_off,
4865       return_off2,
4866       framesize // inclusive of return address
4867     };
4868 
4869     int insts_size = 512;
4870     int locs_size  = 64;
4871 
4872     CodeBuffer code(name, insts_size, locs_size);
4873     OopMapSet* oop_maps  = new OopMapSet();
4874     MacroAssembler* masm = new MacroAssembler(&code);
4875 
4876     address start = __ pc();
4877 
4878     // This is an inlined and slightly modified version of call_VM
4879     // which has the ability to fetch the return PC out of
4880     // thread-local storage and also sets up last_Java_sp slightly
4881     // differently than the real call_VM
4882 
4883     __ enter(); // Save FP and LR before call
4884 
4885     assert(is_even(framesize/2), "sp not 16-byte aligned");
4886 
4887     // lr and fp are already in place
4888     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4889 
4890     int frame_complete = __ pc() - start;
4891 
4892     // Set up last_Java_sp and last_Java_fp
4893     address the_pc = __ pc();
4894     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4895 
4896     // Call runtime
4897     if (arg1 != noreg) {
4898       assert(arg2 != c_rarg1, "clobbered");
4899       __ mov(c_rarg1, arg1);
4900     }
4901     if (arg2 != noreg) {
4902       __ mov(c_rarg2, arg2);
4903     }
4904     __ mov(c_rarg0, rthread);
4905     BLOCK_COMMENT("call runtime_entry");
4906     __ mov(rscratch1, runtime_entry);
4907     __ blr(rscratch1);
4908 
4909     // Generate oop map
4910     OopMap* map = new OopMap(framesize, 0);
4911 
4912     oop_maps->add_gc_map(the_pc - start, map);
4913 
4914     __ reset_last_Java_frame(true);
4915     __ maybe_isb();
4916 
4917     __ leave();
4918 
4919     // check for pending exceptions
4920 #ifdef ASSERT
4921     Label L;
4922     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4923     __ cbnz(rscratch1, L);
4924     __ should_not_reach_here();
4925     __ bind(L);
4926 #endif // ASSERT
4927     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4928 
4929 
4930     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4931     RuntimeStub* stub =
4932       RuntimeStub::new_runtime_stub(name,
4933                                     &code,
4934                                     frame_complete,
4935                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4936                                     oop_maps, false);
4937     return stub->entry_point();
4938   }
4939 
4940   class MontgomeryMultiplyGenerator : public MacroAssembler {
4941 
4942     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4943       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4944 
4945     RegSet _toSave;
4946     bool _squaring;
4947 
4948   public:
4949     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4950       : MacroAssembler(as->code()), _squaring(squaring) {
4951 
4952       // Register allocation
4953 
4954       Register reg = c_rarg0;
4955       Pa_base = reg;       // Argument registers
4956       if (squaring)
4957         Pb_base = Pa_base;
4958       else
4959         Pb_base = ++reg;
4960       Pn_base = ++reg;
4961       Rlen= ++reg;
4962       inv = ++reg;
4963       Pm_base = ++reg;
4964 
4965                           // Working registers:
4966       Ra =  ++reg;        // The current digit of a, b, n, and m.
4967       Rb =  ++reg;
4968       Rm =  ++reg;
4969       Rn =  ++reg;
4970 
4971       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4972       Pb =  ++reg;
4973       Pm =  ++reg;
4974       Pn =  ++reg;
4975 
4976       t0 =  ++reg;        // Three registers which form a
4977       t1 =  ++reg;        // triple-precision accumulator.
4978       t2 =  ++reg;
4979 
4980       Ri =  ++reg;        // Inner and outer loop indexes.
4981       Rj =  ++reg;
4982 
4983       Rhi_ab = ++reg;     // Product registers: low and high parts
4984       Rlo_ab = ++reg;     // of a*b and m*n.
4985       Rhi_mn = ++reg;
4986       Rlo_mn = ++reg;
4987 
4988       // r19 and up are callee-saved.
4989       _toSave = RegSet::range(r19, reg) + Pm_base;
4990     }
4991 
4992   private:
4993     void save_regs() {
4994       push(_toSave, sp);
4995     }
4996 
4997     void restore_regs() {
4998       pop(_toSave, sp);
4999     }
5000 
5001     template <typename T>
5002     void unroll_2(Register count, T block) {
5003       Label loop, end, odd;
5004       tbnz(count, 0, odd);
5005       cbz(count, end);
5006       align(16);
5007       bind(loop);
5008       (this->*block)();
5009       bind(odd);
5010       (this->*block)();
5011       subs(count, count, 2);
5012       br(Assembler::GT, loop);
5013       bind(end);
5014     }
5015 
5016     template <typename T>
5017     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5018       Label loop, end, odd;
5019       tbnz(count, 0, odd);
5020       cbz(count, end);
5021       align(16);
5022       bind(loop);
5023       (this->*block)(d, s, tmp);
5024       bind(odd);
5025       (this->*block)(d, s, tmp);
5026       subs(count, count, 2);
5027       br(Assembler::GT, loop);
5028       bind(end);
5029     }
5030 
5031     void pre1(RegisterOrConstant i) {
5032       block_comment("pre1");
5033       // Pa = Pa_base;
5034       // Pb = Pb_base + i;
5035       // Pm = Pm_base;
5036       // Pn = Pn_base + i;
5037       // Ra = *Pa;
5038       // Rb = *Pb;
5039       // Rm = *Pm;
5040       // Rn = *Pn;
5041       ldr(Ra, Address(Pa_base));
5042       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5043       ldr(Rm, Address(Pm_base));
5044       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5045       lea(Pa, Address(Pa_base));
5046       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5047       lea(Pm, Address(Pm_base));
5048       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5049 
5050       // Zero the m*n result.
5051       mov(Rhi_mn, zr);
5052       mov(Rlo_mn, zr);
5053     }
5054 
5055     // The core multiply-accumulate step of a Montgomery
5056     // multiplication.  The idea is to schedule operations as a
5057     // pipeline so that instructions with long latencies (loads and
5058     // multiplies) have time to complete before their results are
5059     // used.  This most benefits in-order implementations of the
5060     // architecture but out-of-order ones also benefit.
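    // In C, one MACC step is approximately (a sketch; "MACC" is the name used
    // in the pseudocode comments further down, not a function in this file):
    //
    //   // (t2:t1:t0) += a * b, with a*b taken as a full 128-bit product
    //   static void macc(unsigned long a, unsigned long b,
    //                    unsigned long* t0, unsigned long* t1, unsigned long* t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned long lo = (unsigned long)p, hi = (unsigned long)(p >> 64);
    //     unsigned __int128 s = (unsigned __int128)*t0 + lo;            // adds
    //     *t0 = (unsigned long)s;
    //     s = (unsigned __int128)*t1 + hi + (unsigned long)(s >> 64);   // adcs
    //     *t1 = (unsigned long)s;
    //     *t2 += (unsigned long)(s >> 64);                              // adc
    //   }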
5061     void step() {
5062       block_comment("step");
5063       // MACC(Ra, Rb, t0, t1, t2);
5064       // Ra = *++Pa;
5065       // Rb = *--Pb;
5066       umulh(Rhi_ab, Ra, Rb);
5067       mul(Rlo_ab, Ra, Rb);
5068       ldr(Ra, pre(Pa, wordSize));
5069       ldr(Rb, pre(Pb, -wordSize));
5070       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5071                                        // previous iteration.
5072       // MACC(Rm, Rn, t0, t1, t2);
5073       // Rm = *++Pm;
5074       // Rn = *--Pn;
5075       umulh(Rhi_mn, Rm, Rn);
5076       mul(Rlo_mn, Rm, Rn);
5077       ldr(Rm, pre(Pm, wordSize));
5078       ldr(Rn, pre(Pn, -wordSize));
5079       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5080     }
5081 
5082     void post1() {
5083       block_comment("post1");
5084 
5085       // MACC(Ra, Rb, t0, t1, t2);
5086       // Ra = *++Pa;
5087       // Rb = *--Pb;
5088       umulh(Rhi_ab, Ra, Rb);
5089       mul(Rlo_ab, Ra, Rb);
5090       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5091       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5092 
5093       // *Pm = Rm = t0 * inv;
5094       mul(Rm, t0, inv);
5095       str(Rm, Address(Pm));
5096 
5097       // MACC(Rm, Rn, t0, t1, t2);
5098       // t0 = t1; t1 = t2; t2 = 0;
5099       umulh(Rhi_mn, Rm, Rn);
5100 
5101 #ifndef PRODUCT
5102       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5103       {
5104         mul(Rlo_mn, Rm, Rn);
5105         add(Rlo_mn, t0, Rlo_mn);
5106         Label ok;
5107         cbz(Rlo_mn, ok); {
5108           stop("broken Montgomery multiply");
5109         } bind(ok);
5110       }
5111 #endif
5112       // We have very carefully set things up so that
5113       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5114       // the lower half of Rm * Rn because we know the result already:
5115       // it must be -t0.  t0 + (-t0) must generate a carry iff
5116       // t0 != 0.  So, rather than do a mul and an adds we just set
5117       // the carry flag iff t0 is nonzero.
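      // (A note on the trick: AArch64 SUBS sets the carry flag when the
      // subtraction does not borrow, so "subs zr, t0, 1" leaves C set exactly
      // when t0 >= 1, i.e. when t0 != 0.)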
5118       //
5119       // mul(Rlo_mn, Rm, Rn);
5120       // adds(zr, t0, Rlo_mn);
5121       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5122       adcs(t0, t1, Rhi_mn);
5123       adc(t1, t2, zr);
5124       mov(t2, zr);
5125     }
5126 
5127     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5128       block_comment("pre2");
5129       // Pa = Pa_base + i-len;
5130       // Pb = Pb_base + len;
5131       // Pm = Pm_base + i-len;
5132       // Pn = Pn_base + len;
5133 
5134       if (i.is_register()) {
5135         sub(Rj, i.as_register(), len);
5136       } else {
5137         mov(Rj, i.as_constant());
5138         sub(Rj, Rj, len);
5139       }
5140       // Rj == i-len
5141 
5142       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5143       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5144       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5145       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5146 
5147       // Ra = *++Pa;
5148       // Rb = *--Pb;
5149       // Rm = *++Pm;
5150       // Rn = *--Pn;
5151       ldr(Ra, pre(Pa, wordSize));
5152       ldr(Rb, pre(Pb, -wordSize));
5153       ldr(Rm, pre(Pm, wordSize));
5154       ldr(Rn, pre(Pn, -wordSize));
5155 
5156       mov(Rhi_mn, zr);
5157       mov(Rlo_mn, zr);
5158     }
5159 
5160     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5161       block_comment("post2");
5162       if (i.is_constant()) {
5163         mov(Rj, i.as_constant()-len.as_constant());
5164       } else {
5165         sub(Rj, i.as_register(), len);
5166       }
5167 
5168       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5169 
5170       // As soon as we know the least significant digit of our result,
5171       // store it.
5172       // Pm_base[i-len] = t0;
5173       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5174 
5175       // t0 = t1; t1 = t2; t2 = 0;
5176       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5177       adc(t1, t2, zr);
5178       mov(t2, zr);
5179     }
5180 
5181     // A carry in t0 after Montgomery multiplication means that we
5182     // should subtract multiples of n from our result in m.  We'll
5183     // keep doing that until there is no carry.
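    // In C, approximately (a sketch; "sub" is the helper named in the
    // pseudocode comments, not a function defined in this file):
    //
    //   // Pm -= Pn; returns t0 minus the borrow out of the top word
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long pm = Pm[i], pn = Pn[i];
    //       unsigned long d  = pm - pn - borrow;
    //       borrow = (pm < pn) || (pm == pn && borrow);
    //       Pm[i] = d;
    //     }
    //     return t0 - borrow;
    //   }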
5184     void normalize(RegisterOrConstant len) {
5185       block_comment("normalize");
5186       // while (t0)
5187       //   t0 = sub(Pm_base, Pn_base, t0, len);
5188       Label loop, post, again;
5189       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5190       cbz(t0, post); {
5191         bind(again); {
5192           mov(i, zr);
5193           mov(cnt, len);
5194           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5195           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5196           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5197           align(16);
5198           bind(loop); {
5199             sbcs(Rm, Rm, Rn);
5200             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5201             add(i, i, 1);
5202             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5203             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5204             sub(cnt, cnt, 1);
5205           } cbnz(cnt, loop);
5206           sbc(t0, t0, zr);
5207         } cbnz(t0, again);
5208       } bind(post);
5209     }
5210 
5211     // Move memory at s to d, reversing words.
5212     //    Increments d to end of copied memory
5213     //    Destroys tmp1, tmp2
5214     //    Preserves len
5215     //    Leaves s pointing to the address which was in d at start
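    // In C, approximately (a sketch): the arguments are arrays of 32-bit Java
    // ints, so each 64-bit word is copied in reverse word order with its two
    // 32-bit halves swapped (the ror by 32 in reverse1 below).
    //
    //   void reverse(unsigned long* d, const unsigned long* s, int len) {
    //     for (int i = 0; i < len; i++) {
    //       unsigned long w = s[len - 1 - i];
    //       d[i] = (w << 32) | (w >> 32);
    //     }
    //   }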
5216     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5217       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5218 
5219       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5220       mov(tmp1, len);
5221       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5222       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5223     }
5224     // where
5225     void reverse1(Register d, Register s, Register tmp) {
5226       ldr(tmp, pre(s, -wordSize));
5227       ror(tmp, tmp, 32);
5228       str(tmp, post(d, wordSize));
5229     }
5230 
5231     void step_squaring() {
5232       // An extra ACC
5233       step();
5234       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5235     }
5236 
5237     void last_squaring(RegisterOrConstant i) {
5238       Label dont;
5239       // if ((i & 1) == 0) {
5240       tbnz(i.as_register(), 0, dont); {
5241         // MACC(Ra, Rb, t0, t1, t2);
5242         // Ra = *++Pa;
5243         // Rb = *--Pb;
5244         umulh(Rhi_ab, Ra, Rb);
5245         mul(Rlo_ab, Ra, Rb);
5246         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5247       } bind(dont);
5248     }
5249 
5250     void extra_step_squaring() {
5251       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5252 
5253       // MACC(Rm, Rn, t0, t1, t2);
5254       // Rm = *++Pm;
5255       // Rn = *--Pn;
5256       umulh(Rhi_mn, Rm, Rn);
5257       mul(Rlo_mn, Rm, Rn);
5258       ldr(Rm, pre(Pm, wordSize));
5259       ldr(Rn, pre(Pn, -wordSize));
5260     }
5261 
5262     void post1_squaring() {
5263       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5264 
5265       // *Pm = Rm = t0 * inv;
5266       mul(Rm, t0, inv);
5267       str(Rm, Address(Pm));
5268 
5269       // MACC(Rm, Rn, t0, t1, t2);
5270       // t0 = t1; t1 = t2; t2 = 0;
5271       umulh(Rhi_mn, Rm, Rn);
5272 
5273 #ifndef PRODUCT
5274       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5275       {
5276         mul(Rlo_mn, Rm, Rn);
5277         add(Rlo_mn, t0, Rlo_mn);
5278         Label ok;
5279         cbz(Rlo_mn, ok); {
5280           stop("broken Montgomery multiply");
5281         } bind(ok);
5282       }
5283 #endif
5284       // We have very carefully set things up so that
5285       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5286       // the lower half of Rm * Rn because we know the result already:
5287       // it must be -t0.  t0 + (-t0) must generate a carry iff
5288       // t0 != 0.  So, rather than do a mul and an adds we just set
5289       // the carry flag iff t0 is nonzero.
5290       //
5291       // mul(Rlo_mn, Rm, Rn);
5292       // adds(zr, t0, Rlo_mn);
5293       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5294       adcs(t0, t1, Rhi_mn);
5295       adc(t1, t2, zr);
5296       mov(t2, zr);
5297     }
5298 
5299     void acc(Register Rhi, Register Rlo,
5300              Register t0, Register t1, Register t2) {
5301       adds(t0, t0, Rlo);
5302       adcs(t1, t1, Rhi);
5303       adc(t2, t2, zr);
5304     }
5305 
5306   public:
5307     /**
5308      * Fast Montgomery multiplication.  The derivation of the
5309      * algorithm is in A Cryptographic Library for the Motorola
5310      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5311      *
5312      * Arguments:
5313      *
5314      * Inputs for multiplication:
5315      *   c_rarg0   - int array elements a
5316      *   c_rarg1   - int array elements b
5317      *   c_rarg2   - int array elements n (the modulus)
5318      *   c_rarg3   - int length
5319      *   c_rarg4   - int inv
5320      *   c_rarg5   - int array elements m (the result)
5321      *
5322      * Inputs for squaring:
5323      *   c_rarg0   - int array elements a
5324      *   c_rarg1   - int array elements n (the modulus)
5325      *   c_rarg2   - int length
5326      *   c_rarg3   - int inv
5327      *   c_rarg4   - int array elements m (the result)
5328      *
5329      */
5330     address generate_multiply() {
5331       Label argh, nothing;
5332       bind(argh);
5333       stop("MontgomeryMultiply total_allocation must be <= 8192");
5334 
5335       align(CodeEntryAlignment);
5336       address entry = pc();
5337 
5338       cbzw(Rlen, nothing);
5339 
5340       enter();
5341 
5342       // Make room.
5343       cmpw(Rlen, 512);
5344       br(Assembler::HI, argh);
5345       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5346       andr(sp, Ra, -2 * wordSize);
5347 
5348       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5349 
5350       {
5351         // Copy input args, reversing as we go.  We use Ra as a
5352         // temporary variable.
5353         reverse(Ra, Pa_base, Rlen, t0, t1);
5354         if (!_squaring)
5355           reverse(Ra, Pb_base, Rlen, t0, t1);
5356         reverse(Ra, Pn_base, Rlen, t0, t1);
5357       }
5358 
5359       // Push all call-saved registers and also Pm_base which we'll need
5360       // at the end.
5361       save_regs();
5362 
5363 #ifndef PRODUCT
5364       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5365       {
5366         ldr(Rn, Address(Pn_base, 0));
5367         mul(Rlo_mn, Rn, inv);
5368         subs(zr, Rlo_mn, -1);
5369         Label ok;
5370         br(EQ, ok); {
5371           stop("broken inverse in Montgomery multiply");
5372         } bind(ok);
5373       }
5374 #endif
5375 
5376       mov(Pm_base, Ra);
5377 
5378       mov(t0, zr);
5379       mov(t1, zr);
5380       mov(t2, zr);
5381 
5382       block_comment("for (int i = 0; i < len; i++) {");
5383       mov(Ri, zr); {
5384         Label loop, end;
5385         cmpw(Ri, Rlen);
5386         br(Assembler::GE, end);
5387 
5388         bind(loop);
5389         pre1(Ri);
5390 
5391         block_comment("  for (j = i; j; j--) {"); {
5392           movw(Rj, Ri);
5393           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5394         } block_comment("  } // j");
5395 
5396         post1();
5397         addw(Ri, Ri, 1);
5398         cmpw(Ri, Rlen);
5399         br(Assembler::LT, loop);
5400         bind(end);
5401         block_comment("} // i");
5402       }
5403 
5404       block_comment("for (int i = len; i < 2*len; i++) {");
5405       mov(Ri, Rlen); {
5406         Label loop, end;
5407         cmpw(Ri, Rlen, Assembler::LSL, 1);
5408         br(Assembler::GE, end);
5409 
5410         bind(loop);
5411         pre2(Ri, Rlen);
5412 
5413         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5414           lslw(Rj, Rlen, 1);
5415           subw(Rj, Rj, Ri);
5416           subw(Rj, Rj, 1);
5417           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5418         } block_comment("  } // j");
5419 
5420         post2(Ri, Rlen);
5421         addw(Ri, Ri, 1);
5422         cmpw(Ri, Rlen, Assembler::LSL, 1);
5423         br(Assembler::LT, loop);
5424         bind(end);
5425       }
5426       block_comment("} // i");
5427 
5428       normalize(Rlen);
5429 
5430       mov(Ra, Pm_base);  // Save Pm_base in Ra
5431       restore_regs();  // Restore caller's Pm_base
5432 
5433       // Copy our result into caller's Pm_base
5434       reverse(Pm_base, Ra, Rlen, t0, t1);
5435 
5436       leave();
5437       bind(nothing);
5438       ret(lr);
5439 
5440       return entry;
5441     }
5442     // In C, approximately:
5443 
5444     // void
5445     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5446     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5447     //                     unsigned long inv, int len) {
5448     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5449     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5450     //   unsigned long Ra, Rb, Rn, Rm;
5451 
5452     //   int i;
5453 
5454     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5455 
5456     //   for (i = 0; i < len; i++) {
5457     //     int j;
5458 
5459     //     Pa = Pa_base;
5460     //     Pb = Pb_base + i;
5461     //     Pm = Pm_base;
5462     //     Pn = Pn_base + i;
5463 
5464     //     Ra = *Pa;
5465     //     Rb = *Pb;
5466     //     Rm = *Pm;
5467     //     Rn = *Pn;
5468 
5469     //     int iters = i;
5470     //     for (j = 0; iters--; j++) {
5471     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5472     //       MACC(Ra, Rb, t0, t1, t2);
5473     //       Ra = *++Pa;
5474     //       Rb = *--Pb;
5475     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5476     //       MACC(Rm, Rn, t0, t1, t2);
5477     //       Rm = *++Pm;
5478     //       Rn = *--Pn;
5479     //     }
5480 
5481     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5482     //     MACC(Ra, Rb, t0, t1, t2);
5483     //     *Pm = Rm = t0 * inv;
5484     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5485     //     MACC(Rm, Rn, t0, t1, t2);
5486 
5487     //     assert(t0 == 0, "broken Montgomery multiply");
5488 
5489     //     t0 = t1; t1 = t2; t2 = 0;
5490     //   }
5491 
5492     //   for (i = len; i < 2*len; i++) {
5493     //     int j;
5494 
5495     //     Pa = Pa_base + i-len;
5496     //     Pb = Pb_base + len;
5497     //     Pm = Pm_base + i-len;
5498     //     Pn = Pn_base + len;
5499 
5500     //     Ra = *++Pa;
5501     //     Rb = *--Pb;
5502     //     Rm = *++Pm;
5503     //     Rn = *--Pn;
5504 
5505     //     int iters = len*2-i-1;
5506     //     for (j = i-len+1; iters--; j++) {
5507     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5508     //       MACC(Ra, Rb, t0, t1, t2);
5509     //       Ra = *++Pa;
5510     //       Rb = *--Pb;
5511     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5512     //       MACC(Rm, Rn, t0, t1, t2);
5513     //       Rm = *++Pm;
5514     //       Rn = *--Pn;
5515     //     }
5516 
5517     //     Pm_base[i-len] = t0;
5518     //     t0 = t1; t1 = t2; t2 = 0;
5519     //   }
5520 
5521     //   while (t0)
5522     //     t0 = sub(Pm_base, Pn_base, t0, len);
5523     // }
5524 
5525     /**
5526      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5527      * multiplies than Montgomery multiplication so it should be up to
5528      * 25% faster.  However, its loop control is more complex and it
5529      * may actually run slower on some machines.
5530      *
5531      * Arguments:
5532      *
5533      * Inputs:
5534      *   c_rarg0   - int array elements a
5535      *   c_rarg1   - int array elements n (the modulus)
5536      *   c_rarg2   - int length
5537      *   c_rarg3   - int inv
5538      *   c_rarg4   - int array elements m (the result)
5539      *
5540      */
5541     address generate_square() {
5542       Label argh;
5543       bind(argh);
5544       stop("MontgomeryMultiply total_allocation must be <= 8192");
5545 
5546       align(CodeEntryAlignment);
5547       address entry = pc();
5548 
5549       enter();
5550 
5551       // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
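    // A note on the helpers used above (their behaviour is assumed from
    // the way the pseudo-code uses them, not quoted from a definition):
    //   MACC(A, B, t0, t1, t2)  -- accumulate the double-word product A*B
    //                              into the triple-precision sum t2:t1:t0;
    //   MACC2(A, B, t0, t1, t2) -- accumulate the product twice, which is
    //                              how the doubled cross terms of the
    //                              square are folded in.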
  };


  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points.

    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // This entry is referenced by megamorphic calls.
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
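// StubGenerator_generate is the platform entry point for stub generation.
// It is normally driven from the shared stub-routine initialization code
// (see stubRoutines.hpp/.cpp, not this file): once early with all == false
// for the initial stubs the interpreter needs, and once after universe
// init with all == true for the remainder.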
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}