1 /*
2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  *
24  */
25 
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "nativeInst_aarch64.hpp"
34 #include "oops/instanceOop.hpp"
35 #include "oops/method.hpp"
36 #include "oops/objArrayKlass.hpp"
37 #include "oops/oop.inline.hpp"
38 #include "prims/methodHandles.hpp"
39 #include "runtime/atomic.hpp"
40 #include "runtime/frame.inline.hpp"
41 #include "runtime/handles.inline.hpp"
42 #include "runtime/sharedRuntime.hpp"
43 #include "runtime/stubCodeGenerator.hpp"
44 #include "runtime/stubRoutines.hpp"
45 #include "runtime/thread.inline.hpp"
46 #include "utilities/align.hpp"
47 #ifdef COMPILER2
48 #include "opto/runtime.hpp"
49 #endif
50 
51 // Declaration and definition of StubGenerator (no .hpp file).
52 // For a more detailed description of the stub routine structure
53 // see the comment in stubRoutines.hpp
54 
55 #undef __
56 #define __ _masm->
57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
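// TIMES_OOP scales an array index by the in-heap oop size: 4 bytes
// (shift of 2) when compressed oops are in use, 8 bytes (shift of 3)
// otherwise.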
58 
59 #ifdef PRODUCT
60 #define BLOCK_COMMENT(str) /* nothing */
61 #else
62 #define BLOCK_COMMENT(str) __ block_comment(str)
63 #endif
64 
65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
66 
67 // Stub Code definitions
68 
69 class StubGenerator: public StubCodeGenerator {
70  private:
71 
72 #ifdef PRODUCT
73 #define inc_counter_np(counter) ((void)0)
74 #else
75   void inc_counter_np_(int& counter) {
76     __ lea(rscratch2, ExternalAddress((address)&counter));
77     __ ldrw(rscratch1, Address(rscratch2));
78     __ addw(rscratch1, rscratch1, 1);
79     __ strw(rscratch1, Address(rscratch2));
80   }
81 #define inc_counter_np(counter) \
82   BLOCK_COMMENT("inc_counter " #counter); \
83   inc_counter_np_(counter);
84 #endif
85 
86   // Call stubs are used to call Java from C
87   //
88   // Arguments:
89   //    c_rarg0:   call wrapper address                   address
90   //    c_rarg1:   result                                 address
91   //    c_rarg2:   result type                            BasicType
92   //    c_rarg3:   method                                 Method*
93   //    c_rarg4:   (interpreter) entry point              address
94   //    c_rarg5:   parameters                             intptr_t*
95   //    c_rarg6:   parameter size (in words)              int
96   //    c_rarg7:   thread                                 Thread*
97   //
98   // There is no return from the stub itself as any Java result
99   // is written to result
100   //
101   // we save r30 (lr) as the return PC at the base of the frame,
102   // link r29 (fp) immediately below it as the saved frame pointer,
103   // and then install sp (r31) into fp.
104   //
105   // we save r0-r7, which accounts for all the c arguments.
106   //
107   // TODO: strictly do we need to save them all? they are treated as
108   // volatile by C so could we omit saving the ones we are going to
109   // place in global registers (thread? method?) or those we only use
110   // during setup of the Java call?
111   //
112   // we don't need to save r8 which C uses as an indirect result location
113   // return register.
114   //
115   // we don't need to save r9-r15 which both C and Java treat as
116   // volatile
117   //
118   // we don't need to save r16-18 because Java does not use them
119   //
120   // we save r19-r28 which Java uses as scratch registers and C
121   // expects to be callee-save
122   //
123   // we save the bottom 64 bits of each value stored in v8-v15; it is
124   // the responsibility of the caller to preserve larger values.
125   //
126   // so the stub frame looks like this when we enter Java code
127   //
128   //     [ return_from_Java     ] <--- sp
129   //     [ argument word n      ]
130   //      ...
131   // -27 [ argument word 1      ]
132   // -26 [ saved v15            ] <--- sp_after_call
133   // -25 [ saved v14            ]
134   // -24 [ saved v13            ]
135   // -23 [ saved v12            ]
136   // -22 [ saved v11            ]
137   // -21 [ saved v10            ]
138   // -20 [ saved v9             ]
139   // -19 [ saved v8             ]
140   // -18 [ saved r28            ]
141   // -17 [ saved r27            ]
142   // -16 [ saved r26            ]
143   // -15 [ saved r25            ]
144   // -14 [ saved r24            ]
145   // -13 [ saved r23            ]
146   // -12 [ saved r22            ]
147   // -11 [ saved r21            ]
148   // -10 [ saved r20            ]
149   //  -9 [ saved r19            ]
150   //  -8 [ call wrapper    (r0) ]
151   //  -7 [ result          (r1) ]
152   //  -6 [ result type     (r2) ]
153   //  -5 [ method          (r3) ]
154   //  -4 [ entry point     (r4) ]
155   //  -3 [ parameters      (r5) ]
156   //  -2 [ parameter size  (r6) ]
157   //  -1 [ thread (r7)          ]
158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
159   //   1 [ saved lr       (r30) ]
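  //
  // For reference, the VM enters this stub through the CallStub
  // function pointer returned by StubRoutines::call_stub().  A rough
  // sketch of the call made by JavaCalls::call_helper (see
  // javaCalls.cpp for the real argument expressions) is:
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,             // call wrapper
  //       result_val_address,         // where to store the Java result
  //       result_type,                // BasicType of the result
  //       method(),                   // Method* to invoke
  //       entry_point,                // interpreter entry point
  //       args->parameters(),         // intptr_t* parameter words
  //       args->size_of_parameters(), // parameter count in words
  //       thread);                    // current JavaThread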
160 
161   // Call stub stack layout word offsets from fp
162   enum call_stub_layout {
163     sp_after_call_off = -26,
164 
165     d15_off            = -26,
166     d13_off            = -24,
167     d11_off            = -22,
168     d9_off             = -20,
169 
170     r28_off            = -18,
171     r26_off            = -16,
172     r24_off            = -14,
173     r22_off            = -12,
174     r20_off            = -10,
175     call_wrapper_off   =  -8,
176     result_off         =  -7,
177     result_type_off    =  -6,
178     method_off         =  -5,
179     entry_point_off    =  -4,
180     parameter_size_off =  -2,
181     thread_off         =  -1,
182     fp_f               =   0,
183     retaddr_off        =   1,
184   };
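  // With wordSize == 8 these are word offsets from rfp; e.g. d15_off == -26
  // means v15 is saved at [rfp - 208] and retaddr_off == 1 means the saved
  // lr sits at [rfp + 8].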
185 
186   address generate_call_stub(address& return_address) {
187     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
188            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
189            "adjust this code");
190 
191     StubCodeMark mark(this, "StubRoutines", "call_stub");
192     address start = __ pc();
193 
194     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
195 
196     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
197     const Address result        (rfp, result_off         * wordSize);
198     const Address result_type   (rfp, result_type_off    * wordSize);
199     const Address method        (rfp, method_off         * wordSize);
200     const Address entry_point   (rfp, entry_point_off    * wordSize);
201     const Address parameter_size(rfp, parameter_size_off * wordSize);
202 
203     const Address thread        (rfp, thread_off         * wordSize);
204 
205     const Address d15_save      (rfp, d15_off * wordSize);
206     const Address d13_save      (rfp, d13_off * wordSize);
207     const Address d11_save      (rfp, d11_off * wordSize);
208     const Address d9_save       (rfp, d9_off * wordSize);
209 
210     const Address r28_save      (rfp, r28_off * wordSize);
211     const Address r26_save      (rfp, r26_off * wordSize);
212     const Address r24_save      (rfp, r24_off * wordSize);
213     const Address r22_save      (rfp, r22_off * wordSize);
214     const Address r20_save      (rfp, r20_off * wordSize);
215 
216     // stub code
217 
218     address aarch64_entry = __ pc();
219 
220     // set up frame and move sp to end of save area
221     __ enter();
222     __ sub(sp, rfp, -sp_after_call_off * wordSize);
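    // sp is now rfp - 26 * wordSize == rfp - 208, i.e. sp_after_call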
223 
224     // save register parameters and Java scratch/global registers
225     // n.b. we save thread even though it gets installed in
226     // rthread because we want to sanity check rthread later
227     __ str(c_rarg7,  thread);
228     __ strw(c_rarg6, parameter_size);
229     __ stp(c_rarg4, c_rarg5,  entry_point);
230     __ stp(c_rarg2, c_rarg3,  result_type);
231     __ stp(c_rarg0, c_rarg1,  call_wrapper);
232 
233     __ stp(r20, r19,   r20_save);
234     __ stp(r22, r21,   r22_save);
235     __ stp(r24, r23,   r24_save);
236     __ stp(r26, r25,   r26_save);
237     __ stp(r28, r27,   r28_save);
238 
239     __ stpd(v9,  v8,   d9_save);
240     __ stpd(v11, v10,  d11_save);
241     __ stpd(v13, v12,  d13_save);
242     __ stpd(v15, v14,  d15_save);
243 
244     // install Java thread in global register now we have saved
245     // whatever value it held
246     __ mov(rthread, c_rarg7);
247     // And method
248     __ mov(rmethod, c_rarg3);
249 
250     // set up the heapbase register
251     __ reinit_heapbase();
252 
253 #ifdef ASSERT
254     // make sure we have no pending exceptions
255     {
256       Label L;
257       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
258       __ cmp(rscratch1, (unsigned)NULL_WORD);
259       __ br(Assembler::EQ, L);
260       __ stop("StubRoutines::call_stub: entered with pending exception");
261       __ BIND(L);
262     }
263 #endif
264     // pass parameters if any
265     __ mov(esp, sp);
266     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
267     __ andr(sp, rscratch1, -2 * wordSize);
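    // n.b. and-ing with -16 rounds sp down to the 16-byte alignment
    // required by the AArch64 ABI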
268 
269     BLOCK_COMMENT("pass parameters if any");
270     Label parameters_done;
271     // parameter count is still in c_rarg6
272     // and parameter pointer identifying param 1 is in c_rarg5
273     __ cbzw(c_rarg6, parameters_done);
274 
275     address loop = __ pc();
276     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
277     __ subsw(c_rarg6, c_rarg6, 1);
278     __ push(rscratch1);
279     __ br(Assembler::GT, loop);
280 
281     __ BIND(parameters_done);
282 
283     // call Java entry -- passing Method* and current sp
284     //      rmethod: Method*
285     //      r13: sender sp
286     BLOCK_COMMENT("call Java function");
287     __ mov(r13, sp);
288     __ blr(c_rarg4);
289 
290     // we do this here because the notify will already have been done
291     // if we get to the next instruction via an exception
292     //
293     // n.b. adding this instruction here affects the calculation of
294     // whether or not a routine returns to the call stub (used when
295     // doing stack walks) since the normal test is to check the return
296     // pc against the address saved below. so we may need to allow for
297     // this extra instruction in the check.
298 
299     // save current address for use by exception handling code
300 
301     return_address = __ pc();
302 
303     // store result depending on type (everything that is not
304     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
305     // n.b. this assumes Java returns an integral result in r0
306     // and a floating result in j_farg0
307     __ ldr(j_rarg2, result);
308     Label is_long, is_float, is_double, exit;
309     __ ldr(j_rarg1, result_type);
310     __ cmp(j_rarg1, T_OBJECT);
311     __ br(Assembler::EQ, is_long);
312     __ cmp(j_rarg1, T_LONG);
313     __ br(Assembler::EQ, is_long);
314     __ cmp(j_rarg1, T_FLOAT);
315     __ br(Assembler::EQ, is_float);
316     __ cmp(j_rarg1, T_DOUBLE);
317     __ br(Assembler::EQ, is_double);
318 
319     // handle T_INT case
320     __ strw(r0, Address(j_rarg2));
321 
322     __ BIND(exit);
323 
324     // pop parameters
325     __ sub(esp, rfp, -sp_after_call_off * wordSize);
326 
327 #ifdef ASSERT
328     // verify that threads correspond
329     {
330       Label L, S;
331       __ ldr(rscratch1, thread);
332       __ cmp(rthread, rscratch1);
333       __ br(Assembler::NE, S);
334       __ get_thread(rscratch1);
335       __ cmp(rthread, rscratch1);
336       __ br(Assembler::EQ, L);
337       __ BIND(S);
338       __ stop("StubRoutines::call_stub: threads must correspond");
339       __ BIND(L);
340     }
341 #endif
342 
343     // restore callee-save registers
344     __ ldpd(v15, v14,  d15_save);
345     __ ldpd(v13, v12,  d13_save);
346     __ ldpd(v11, v10,  d11_save);
347     __ ldpd(v9,  v8,   d9_save);
348 
349     __ ldp(r28, r27,   r28_save);
350     __ ldp(r26, r25,   r26_save);
351     __ ldp(r24, r23,   r24_save);
352     __ ldp(r22, r21,   r22_save);
353     __ ldp(r20, r19,   r20_save);
354 
355     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
356     __ ldrw(c_rarg2, result_type);
357     __ ldr(c_rarg3,  method);
358     __ ldp(c_rarg4, c_rarg5,  entry_point);
359     __ ldp(c_rarg6, c_rarg7,  parameter_size);
360 
361     // leave frame and return to caller
362     __ leave();
363     __ ret(lr);
364 
365     // handle return types different from T_INT
366 
367     __ BIND(is_long);
368     __ str(r0, Address(j_rarg2, 0));
369     __ br(Assembler::AL, exit);
370 
371     __ BIND(is_float);
372     __ strs(j_farg0, Address(j_rarg2, 0));
373     __ br(Assembler::AL, exit);
374 
375     __ BIND(is_double);
376     __ strd(j_farg0, Address(j_rarg2, 0));
377     __ br(Assembler::AL, exit);
378 
379     return start;
380   }
381 
382   // Return point for a Java call if there's an exception thrown in
383   // Java code.  The exception is caught and transformed into a
384   // pending exception stored in JavaThread that can be tested from
385   // within the VM.
386   //
387   // Note: Usually the parameters are removed by the callee. In case
388   // of an exception crossing an activation frame boundary, that is
389   // not the case if the callee is compiled code => need to set up
390   // the stack pointer.
391   //
392   // r0: exception oop
393 
394   address generate_catch_exception() {
395     StubCodeMark mark(this, "StubRoutines", "catch_exception");
396     address start = __ pc();
397 
398     // same as in generate_call_stub():
399     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
400     const Address thread        (rfp, thread_off         * wordSize);
401 
402 #ifdef ASSERT
403     // verify that threads correspond
404     {
405       Label L, S;
406       __ ldr(rscratch1, thread);
407       __ cmp(rthread, rscratch1);
408       __ br(Assembler::NE, S);
409       __ get_thread(rscratch1);
410       __ cmp(rthread, rscratch1);
411       __ br(Assembler::EQ, L);
412       __ bind(S);
413       __ stop("StubRoutines::catch_exception: threads must correspond");
414       __ bind(L);
415     }
416 #endif
417 
418     // set pending exception
419     __ verify_oop(r0);
420 
421     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
422     __ mov(rscratch1, (address)__FILE__);
423     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
424     __ movw(rscratch1, (int)__LINE__);
425     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
426 
427     // complete return to VM
428     assert(StubRoutines::_call_stub_return_address != NULL,
429            "_call_stub_return_address must have been generated before");
430     __ b(StubRoutines::_call_stub_return_address);
431 
432     return start;
433   }
434 
435   // Continuation point for runtime calls returning with a pending
436   // exception.  The pending exception check happened in the runtime
437   // or native call stub.  The pending exception in Thread is
438   // converted into a Java-level exception.
439   //
440   // Contract with Java-level exception handlers:
441   // r0: exception
442   // r3: throwing pc
443   //
444   // NOTE: At entry of this stub, exception-pc must be in LR !!
445 
446   // NOTE: this is always used as a jump target within generated code
447   // so it just needs to be generated code with no prologue
448 
449   address generate_forward_exception() {
450     StubCodeMark mark(this, "StubRoutines", "forward exception");
451     address start = __ pc();
452 
453     // Upon entry, LR points to the return address returning into
454     // Java (interpreted or compiled) code; i.e., the return address
455     // becomes the throwing pc.
456     //
457     // Arguments pushed before the runtime call are still on the stack
458     // but the exception handler will reset the stack pointer ->
459     // ignore them.  A potential result in registers can be ignored as
460     // well.
461 
462 #ifdef ASSERT
463     // make sure this code is only executed if there is a pending exception
464     {
465       Label L;
466       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
467       __ cbnz(rscratch1, L);
468       __ stop("StubRoutines::forward exception: no pending exception (1)");
469       __ bind(L);
470     }
471 #endif
472 
473     // compute exception handler into r19
474 
475     // call the VM to find the handler address associated with the
476     // caller address. pass thread in r0 and caller pc (ret address)
477     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
478     // the stack.
479     __ mov(c_rarg1, lr);
480     // lr will be trashed by the VM call so we move it to R19
481     // (callee-saved) because we also need to pass it to the handler
482     // returned by this call.
483     __ mov(r19, lr);
484     BLOCK_COMMENT("call exception_handler_for_return_address");
485     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
486                          SharedRuntime::exception_handler_for_return_address),
487                     rthread, c_rarg1);
488     // we should not really care that lr is no longer the callee
489     // address. we saved the value the handler needs in r19 so we can
490     // just copy it to r3. however, the C2 handler will push its own
491   // frame and then call into the VM, and the VM code asserts that
492     // the PC for the frame above the handler belongs to a compiled
493     // Java method. So, we restore lr here to satisfy that assert.
494     __ mov(lr, r19);
495     // setup r0 & r3 & clear pending exception
496     __ mov(r3, r19);
497     __ mov(r19, r0);
498     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
499     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
500 
501 #ifdef ASSERT
502     // make sure exception is set
503     {
504       Label L;
505       __ cbnz(r0, L);
506       __ stop("StubRoutines::forward exception: no pending exception (2)");
507       __ bind(L);
508     }
509 #endif
510 
511     // continue at exception handler
512     // r0: exception
513     // r3: throwing pc
514     // r19: exception handler
515     __ verify_oop(r0);
516     __ br(r19);
517 
518     return start;
519   }
520 
521   // Non-destructive plausibility checks for oops
522   //
523   // Arguments:
524   //    r0: oop to verify
525   //    rscratch1: error message
526   //
527   // Stack after saving c_rarg3:
528   //    [tos + 0]: saved c_rarg3
529   //    [tos + 1]: saved c_rarg2
530   //    [tos + 2]: saved lr
531   //    [tos + 3]: saved rscratch2
532   //    [tos + 4]: saved r0
533   //    [tos + 5]: saved rscratch1
534   address generate_verify_oop() {
535 
536     StubCodeMark mark(this, "StubRoutines", "verify_oop");
537     address start = __ pc();
538 
539     Label exit, error;
540 
541     // save c_rarg2 and c_rarg3
542     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
543 
544     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
545     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
546     __ ldr(c_rarg3, Address(c_rarg2));
547     __ add(c_rarg3, c_rarg3, 1);
548     __ str(c_rarg3, Address(c_rarg2));
549 
550     // object is in r0
551     // make sure object is 'reasonable'
552     __ cbz(r0, exit); // if obj is NULL it is OK
553 
554     // Check if the oop is in the right area of memory
555     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
556     __ andr(c_rarg2, r0, c_rarg3);
557     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
558 
559     // Compare c_rarg2 and c_rarg3.  We don't use a compare
560     // instruction here because the flags register is live.
561     __ eor(c_rarg2, c_rarg2, c_rarg3);
562     __ cbnz(c_rarg2, error);
563 
564     // make sure klass is 'reasonable', i.e. not NULL.
565     __ load_klass(r0, r0);  // get klass
566     __ cbz(r0, error);      // if klass is NULL it is broken
567 
568     // return if everything seems ok
569     __ bind(exit);
570 
571     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
572     __ ret(lr);
573 
574     // handle errors
575     __ bind(error);
576     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
577 
578     __ push(RegSet::range(r0, r29), sp);
579     // debug(char* msg, int64_t pc, int64_t regs[])
580     __ mov(c_rarg0, rscratch1);      // pass address of error message
581     __ mov(c_rarg1, lr);             // pass return address
582     __ mov(c_rarg2, sp);             // pass address of regs on stack
583 #ifndef PRODUCT
584     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
585 #endif
586     BLOCK_COMMENT("call MacroAssembler::debug");
587     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
588     __ blr(rscratch1);
589 
590     return start;
591   }
592 
593   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
594 
595   // The inner part of zero_words().  This is the bulk operation,
596   // zeroing words in blocks, possibly using DC ZVA to do it.  The
597   // caller is responsible for zeroing the last few words.
598   //
599   // Inputs:
600   // r10: the HeapWord-aligned base address of an array to zero.
601   // r11: the count in HeapWords, r11 > 0.
602   //
603   // Returns r10 and r11, adjusted for the caller to clear.
604   // r10: the base address of the tail of words left to clear.
605   // r11: the number of words in the tail.
606   //      r11 < MacroAssembler::zero_words_block_size.
607 
608   address generate_zero_blocks() {
609     Label store_pair, loop_store_pair, done;
610     Label base_aligned;
611 
612     Register base = r10, cnt = r11;
613 
614     __ align(CodeEntryAlignment);
615     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
616     address start = __ pc();
617 
618     if (UseBlockZeroing) {
619       int zva_length = VM_Version::zva_length();
620 
621       // Ensure ZVA length can be divided by 16. This is required by
622       // the subsequent operations.
623       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
624 
625       __ tbz(base, 3, base_aligned);
626       __ str(zr, Address(__ post(base, 8)));
627       __ sub(cnt, cnt, 1);
628       __ bind(base_aligned);
629 
630       // Ensure count >= zva_length * 2 so that it still deserves a zva after
631       // alignment.
632       Label small;
633       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
634       __ subs(rscratch1, cnt, low_limit >> 3);
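      // low_limit is in bytes and cnt is in words, hence the >> 3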
635       __ br(Assembler::LT, small);
636       __ zero_dcache_blocks(base, cnt);
637       __ bind(small);
638     }
639 
640     {
641       // Number of stp instructions we'll unroll
642       const int unroll =
643         MacroAssembler::zero_words_block_size / 2;
644       // Clear the remaining blocks.
645       Label loop;
646       __ subs(cnt, cnt, unroll * 2);
647       __ br(Assembler::LT, done);
648       __ bind(loop);
649       for (int i = 0; i < unroll; i++)
650         __ stp(zr, zr, __ post(base, 16));
651       __ subs(cnt, cnt, unroll * 2);
652       __ br(Assembler::GE, loop);
653       __ bind(done);
654       __ add(cnt, cnt, unroll * 2);
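      // undo the last subs so cnt is again the number of trailing words,
      // 0 <= cnt < MacroAssembler::zero_words_block_size, left for the
      // caller to clear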
655     }
656 
657     __ ret(lr);
658 
659     return start;
660   }
661 
662 
663   typedef enum {
664     copy_forwards = 1,
665     copy_backwards = -1
666   } copy_direction;
667 
668   // Bulk copy of blocks of 8 words.
669   //
670   // count is a count of words.
671   //
672   // Precondition: count >= 8
673   //
674   // Postconditions:
675   //
676   // The least significant bit of count contains the remaining count
677   // of words to copy.  The rest of count is trash.
678   //
679   // s and d are adjusted to point to the remaining words to copy
680   //
681   void generate_copy_longs(Label &start, Register s, Register d, Register count,
682                            copy_direction direction) {
683     int unit = wordSize * direction;
684     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
685 
686     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
687       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
688     const Register stride = r13;
689 
690     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
691     assert_different_registers(s, d, count, rscratch1);
692 
693     Label again, drain;
694     const char *stub_name;
695     if (direction == copy_forwards)
696       stub_name = "forward_copy_longs";
697     else
698       stub_name = "backward_copy_longs";
699 
700     __ align(CodeEntryAlignment);
701 
702     StubCodeMark mark(this, "StubRoutines", stub_name);
703 
704     __ bind(start);
705 
706     Label unaligned_copy_long;
707     if (AvoidUnalignedAccesses) {
708       __ tbnz(d, 3, unaligned_copy_long);
709     }
710 
711     if (direction == copy_forwards) {
712       __ sub(s, s, bias);
713       __ sub(d, d, bias);
714     }
715 
716 #ifdef ASSERT
717     // Make sure we are never given < 8 words
718     {
719       Label L;
720       __ cmp(count, 8);
721       __ br(Assembler::GE, L);
722       __ stop("generate_copy_longs called with < 8 words");
723       __ bind(L);
724     }
725 #endif
726 
727     // Fill 8 registers
728     if (UseSIMDForMemoryOps) {
729       __ ldpq(v0, v1, Address(s, 4 * unit));
730       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
731     } else {
732       __ ldp(t0, t1, Address(s, 2 * unit));
733       __ ldp(t2, t3, Address(s, 4 * unit));
734       __ ldp(t4, t5, Address(s, 6 * unit));
735       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
736     }
737 
738     __ subs(count, count, 16);
739     __ br(Assembler::LO, drain);
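    // eight words are already loaded into registers; if the total count
    // is below sixteen there is nothing left for the main loop to load,
    // so go straight to the drain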
740 
741     int prefetch = PrefetchCopyIntervalInBytes;
742     bool use_stride = false;
743     if (direction == copy_backwards) {
744        use_stride = prefetch > 256;
745        prefetch = -prefetch;
746        if (use_stride) __ mov(stride, prefetch);
747     }
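    // (a register stride is used because a large negative prefetch
    // offset cannot be encoded as a prfm immediate)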
748 
749     __ bind(again);
750 
751     if (PrefetchCopyIntervalInBytes > 0)
752       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
753 
754     if (UseSIMDForMemoryOps) {
755       __ stpq(v0, v1, Address(d, 4 * unit));
756       __ ldpq(v0, v1, Address(s, 4 * unit));
757       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
758       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
759     } else {
760       __ stp(t0, t1, Address(d, 2 * unit));
761       __ ldp(t0, t1, Address(s, 2 * unit));
762       __ stp(t2, t3, Address(d, 4 * unit));
763       __ ldp(t2, t3, Address(s, 4 * unit));
764       __ stp(t4, t5, Address(d, 6 * unit));
765       __ ldp(t4, t5, Address(s, 6 * unit));
766       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
767       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
768     }
769 
770     __ subs(count, count, 8);
771     __ br(Assembler::HS, again);
772 
773     // Drain
774     __ bind(drain);
775     if (UseSIMDForMemoryOps) {
776       __ stpq(v0, v1, Address(d, 4 * unit));
777       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
778     } else {
779       __ stp(t0, t1, Address(d, 2 * unit));
780       __ stp(t2, t3, Address(d, 4 * unit));
781       __ stp(t4, t5, Address(d, 6 * unit));
782       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
783     }
784 
785     {
786       Label L1, L2;
787       __ tbz(count, exact_log2(4), L1);
788       if (UseSIMDForMemoryOps) {
789         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
790         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
791       } else {
792         __ ldp(t0, t1, Address(s, 2 * unit));
793         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
794         __ stp(t0, t1, Address(d, 2 * unit));
795         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
796       }
797       __ bind(L1);
798 
799       if (direction == copy_forwards) {
800         __ add(s, s, bias);
801         __ add(d, d, bias);
802       }
803 
804       __ tbz(count, 1, L2);
805       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
806       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
807       __ bind(L2);
808     }
809 
810     __ ret(lr);
811 
812     if (AvoidUnalignedAccesses) {
813       Label drain, again;
814       // Register order for storing. Order is different for backward copy.
815 
816       __ bind(unaligned_copy_long);
817 
818       // source address is even aligned, target odd aligned
819       //
820       // when forward copying word pairs we read long pairs at offsets
821       // {0, 2, 4, 6} (in long words). when backwards copying we read
822       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
823       // address by -2 in the forwards case so we can compute the
824       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
825       // or -1.
826       //
827       // when forward copying we need to store 1 word, 3 pairs and
828       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
829       // zero offset we adjust the destination by -1, which means we
830       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
831       //
832       // When backwards copying we need to store 1 word, 3 pairs and
833       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
834       // offsets {1, 3, 5, 7, 8} * unit.
835 
836       if (direction == copy_forwards) {
837         __ sub(s, s, 16);
838         __ sub(d, d, 8);
839       }
840 
841       // Fill 8 registers
842       //
843       // for forwards copy s was offset by -16 from the original input
844       // value of s so the register contents are at these offsets
845       // relative to the 64 byte block addressed by that original input
846       // and so on for each successive 64 byte block when s is updated
847       //
848       // t0 at offset 0,  t1 at offset 8
849       // t2 at offset 16, t3 at offset 24
850       // t4 at offset 32, t5 at offset 40
851       // t6 at offset 48, t7 at offset 56
852 
853       // for backwards copy s was not offset so the register contents
854       // are at these offsets into the preceding 64 byte block
855       // relative to that original input and so on for each successive
856       // preceding 64 byte block when s is updated. this explains the
857       // slightly counter-intuitive looking pattern of register usage
858       // in the stp instructions for backwards copy.
859       //
860       // t0 at offset -16, t1 at offset -8
861       // t2 at offset -32, t3 at offset -24
862       // t4 at offset -48, t5 at offset -40
863       // t6 at offset -64, t7 at offset -56
864 
865       __ ldp(t0, t1, Address(s, 2 * unit));
866       __ ldp(t2, t3, Address(s, 4 * unit));
867       __ ldp(t4, t5, Address(s, 6 * unit));
868       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
869 
870       __ subs(count, count, 16);
871       __ br(Assembler::LO, drain);
872 
873       int prefetch = PrefetchCopyIntervalInBytes;
874       bool use_stride = false;
875       if (direction == copy_backwards) {
876          use_stride = prefetch > 256;
877          prefetch = -prefetch;
878          if (use_stride) __ mov(stride, prefetch);
879       }
880 
881       __ bind(again);
882 
883       if (PrefetchCopyIntervalInBytes > 0)
884         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
885 
886       if (direction == copy_forwards) {
887        // allowing for the offset of -8 the store instructions place
888        // registers into the target 64 byte block at the following
889        // offsets
890        //
891        // t0 at offset 0
892        // t1 at offset 8,  t2 at offset 16
893        // t3 at offset 24, t4 at offset 32
894        // t5 at offset 40, t6 at offset 48
895        // t7 at offset 56
896 
897         __ str(t0, Address(d, 1 * unit));
898         __ stp(t1, t2, Address(d, 2 * unit));
899         __ ldp(t0, t1, Address(s, 2 * unit));
900         __ stp(t3, t4, Address(d, 4 * unit));
901         __ ldp(t2, t3, Address(s, 4 * unit));
902         __ stp(t5, t6, Address(d, 6 * unit));
903         __ ldp(t4, t5, Address(s, 6 * unit));
904         __ str(t7, Address(__ pre(d, 8 * unit)));
905         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
906       } else {
907        // d was not offset when we started so the registers are
908        // written into the 64 byte block preceding d with the following
909        // offsets
910        //
911        // t1 at offset -8
912        // t3 at offset -24, t0 at offset -16
913        // t5 at offset -40, t2 at offset -32
914        // t7 at offset -56, t4 at offset -48
915        //                   t6 at offset -64
916        //
917        // note that this matches the offsets previously noted for the
918        // loads
919 
920         __ str(t1, Address(d, 1 * unit));
921         __ stp(t3, t0, Address(d, 3 * unit));
922         __ ldp(t0, t1, Address(s, 2 * unit));
923         __ stp(t5, t2, Address(d, 5 * unit));
924         __ ldp(t2, t3, Address(s, 4 * unit));
925         __ stp(t7, t4, Address(d, 7 * unit));
926         __ ldp(t4, t5, Address(s, 6 * unit));
927         __ str(t6, Address(__ pre(d, 8 * unit)));
928         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
929       }
930 
931       __ subs(count, count, 8);
932       __ br(Assembler::HS, again);
933 
934       // Drain
935       //
936       // this uses the same pattern of offsets and register arguments
937       // as above
938       __ bind(drain);
939       if (direction == copy_forwards) {
940         __ str(t0, Address(d, 1 * unit));
941         __ stp(t1, t2, Address(d, 2 * unit));
942         __ stp(t3, t4, Address(d, 4 * unit));
943         __ stp(t5, t6, Address(d, 6 * unit));
944         __ str(t7, Address(__ pre(d, 8 * unit)));
945       } else {
946         __ str(t1, Address(d, 1 * unit));
947         __ stp(t3, t0, Address(d, 3 * unit));
948         __ stp(t5, t2, Address(d, 5 * unit));
949         __ stp(t7, t4, Address(d, 7 * unit));
950         __ str(t6, Address(__ pre(d, 8 * unit)));
951       }
952       // now we need to copy any remaining part block which may
953       // include a 4 word subblock and/or a 2 word subblock.
954       // bits 2 and 1 in the count are the tell-tale for whether we
955       // have each such subblock
956       {
957         Label L1, L2;
958         __ tbz(count, exact_log2(4), L1);
959        // this is the same as above but copying only 4 longs hence
960        // with only one intervening stp between the str instructions
961        // but note that the offsets and registers still follow the
962        // same pattern
963         __ ldp(t0, t1, Address(s, 2 * unit));
964         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
965         if (direction == copy_forwards) {
966           __ str(t0, Address(d, 1 * unit));
967           __ stp(t1, t2, Address(d, 2 * unit));
968           __ str(t3, Address(__ pre(d, 4 * unit)));
969         } else {
970           __ str(t1, Address(d, 1 * unit));
971           __ stp(t3, t0, Address(d, 3 * unit));
972           __ str(t2, Address(__ pre(d, 4 * unit)));
973         }
974         __ bind(L1);
975 
976         __ tbz(count, 1, L2);
977        // this is the same as above but copying only 2 longs hence
978        // there is no intervening stp between the str instructions
979        // but note that the offset and register patterns are still
980        // the same
981         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
982         if (direction == copy_forwards) {
983           __ str(t0, Address(d, 1 * unit));
984           __ str(t1, Address(__ pre(d, 2 * unit)));
985         } else {
986           __ str(t1, Address(d, 1 * unit));
987           __ str(t0, Address(__ pre(d, 2 * unit)));
988         }
989         __ bind(L2);
990 
991        // for forwards copy we need to re-adjust the offsets we
992        // applied so that s and d follow the last words written
993 
994        if (direction == copy_forwards) {
995          __ add(s, s, 16);
996          __ add(d, d, 8);
997        }
998 
999       }
1000 
1001       __ ret(lr);
1002       }
1003   }
1004 
1005   // Small copy: less than 16 bytes.
1006   //
1007   // NB: Ignores all of the bits of count which represent more than 15
1008   // bytes, so a caller doesn't have to mask them.
1009 
1010   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1011     bool is_backwards = step < 0;
1012     size_t granularity = uabs(step);
1013     int direction = is_backwards ? -1 : 1;
1014     int unit = wordSize * direction;
1015 
1016     Label Lpair, Lword, Lint, Lshort, Lbyte;
1017 
1018     assert(granularity
1019            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1020 
1021     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1022 
1023     // ??? I don't know if this bit-test-and-branch is the right thing
1024     // to do.  It does a lot of jumping, resulting in several
1025     // mispredicted branches.  It might make more sense to do this
1026     // with something like Duff's device with a single computed branch.
1027 
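    // Each tbz below tests the bit of the element count that corresponds
    // to one power-of-two chunk size.  e.g. for a jshort copy
    // (granularity == 2) bit 2 set means 4 shorts (8 bytes) remain,
    // bit 1 means 2 shorts and bit 0 means a single short.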
1028     __ tbz(count, 3 - exact_log2(granularity), Lword);
1029     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1030     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1031     __ bind(Lword);
1032 
1033     if (granularity <= sizeof (jint)) {
1034       __ tbz(count, 2 - exact_log2(granularity), Lint);
1035       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1036       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1037       __ bind(Lint);
1038     }
1039 
1040     if (granularity <= sizeof (jshort)) {
1041       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1042       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1043       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1044       __ bind(Lshort);
1045     }
1046 
1047     if (granularity <= sizeof (jbyte)) {
1048       __ tbz(count, 0, Lbyte);
1049       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1050       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1051       __ bind(Lbyte);
1052     }
1053   }
1054 
1055   Label copy_f, copy_b;
1056 
1057   // All-singing all-dancing memory copy.
1058   //
1059   // Copy count units of memory from s to d.  The size of a unit is
1060   // step, which can be positive or negative depending on the direction
1061   // of copy.  If is_aligned is false, we align the source address.
1062   //
1063 
1064   void copy_memory(bool is_aligned, Register s, Register d,
1065                    Register count, Register tmp, int step) {
1066     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1067     bool is_backwards = step < 0;
1068     unsigned int granularity = uabs(step);
1069     const Register t0 = r3, t1 = r4;
1070 
1071     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1072     // load all the data before writing anything
1073     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1074     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1075     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1076     const Register send = r17, dend = r18;
1077 
1078     if (PrefetchCopyIntervalInBytes > 0)
1079       __ prfm(Address(s, 0), PLDL1KEEP);
1080     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1081     __ br(Assembler::HI, copy_big);
1082 
1083     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1084     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
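    // send/dend are one-past-the-end source and destination addresses.
    // The small cases below copy one chunk from (s, d) and another from
    // (send, dend); the two may overlap in the middle, which is harmless
    // because every load is issued before any store.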
1085 
1086     __ cmp(count, 16/granularity);
1087     __ br(Assembler::LS, copy16);
1088 
1089     __ cmp(count, 64/granularity);
1090     __ br(Assembler::HI, copy80);
1091 
1092     __ cmp(count, 32/granularity);
1093     __ br(Assembler::LS, copy32);
1094 
1095     // 33..64 bytes
1096     if (UseSIMDForMemoryOps) {
1097       __ ldpq(v0, v1, Address(s, 0));
1098       __ ldpq(v2, v3, Address(send, -32));
1099       __ stpq(v0, v1, Address(d, 0));
1100       __ stpq(v2, v3, Address(dend, -32));
1101     } else {
1102       __ ldp(t0, t1, Address(s, 0));
1103       __ ldp(t2, t3, Address(s, 16));
1104       __ ldp(t4, t5, Address(send, -32));
1105       __ ldp(t6, t7, Address(send, -16));
1106 
1107       __ stp(t0, t1, Address(d, 0));
1108       __ stp(t2, t3, Address(d, 16));
1109       __ stp(t4, t5, Address(dend, -32));
1110       __ stp(t6, t7, Address(dend, -16));
1111     }
1112     __ b(finish);
1113 
1114     // 17..32 bytes
1115     __ bind(copy32);
1116     __ ldp(t0, t1, Address(s, 0));
1117     __ ldp(t2, t3, Address(send, -16));
1118     __ stp(t0, t1, Address(d, 0));
1119     __ stp(t2, t3, Address(dend, -16));
1120     __ b(finish);
1121 
1122     // 65..80/96 bytes
1123     // (96 bytes if SIMD because we do 32 bytes per instruction)
1124     __ bind(copy80);
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(s, 32));
1128       // Unaligned pointers can be an issue for copying.
1129       // The issue has more chances to happen when granularity of data is
1130       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1131       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1132       // The most performance drop has been seen for the range 65-80 bytes.
1133       // For such cases using the pair of ldp/stp instead of the third pair of
1134       // ldpq/stpq fixes the performance issue.
1135       if (granularity < sizeof (jint)) {
1136         Label copy96;
1137         __ cmp(count, u1(80/granularity));
1138         __ br(Assembler::HI, copy96);
1139         __ ldp(t0, t1, Address(send, -16));
1140 
1141         __ stpq(v0, v1, Address(d, 0));
1142         __ stpq(v2, v3, Address(d, 32));
1143         __ stp(t0, t1, Address(dend, -16));
1144         __ b(finish);
1145 
1146         __ bind(copy96);
1147       }
1148       __ ldpq(v4, v5, Address(send, -32));
1149 
1150       __ stpq(v0, v1, Address(d, 0));
1151       __ stpq(v2, v3, Address(d, 32));
1152       __ stpq(v4, v5, Address(dend, -32));
1153     } else {
1154       __ ldp(t0, t1, Address(s, 0));
1155       __ ldp(t2, t3, Address(s, 16));
1156       __ ldp(t4, t5, Address(s, 32));
1157       __ ldp(t6, t7, Address(s, 48));
1158       __ ldp(t8, t9, Address(send, -16));
1159 
1160       __ stp(t0, t1, Address(d, 0));
1161       __ stp(t2, t3, Address(d, 16));
1162       __ stp(t4, t5, Address(d, 32));
1163       __ stp(t6, t7, Address(d, 48));
1164       __ stp(t8, t9, Address(dend, -16));
1165     }
1166     __ b(finish);
1167 
1168     // 0..16 bytes
1169     __ bind(copy16);
1170     __ cmp(count, 8/granularity);
1171     __ br(Assembler::LO, copy8);
1172 
1173     // 8..16 bytes
1174     __ ldr(t0, Address(s, 0));
1175     __ ldr(t1, Address(send, -8));
1176     __ str(t0, Address(d, 0));
1177     __ str(t1, Address(dend, -8));
1178     __ b(finish);
1179 
1180     if (granularity < 8) {
1181       // 4..7 bytes
1182       __ bind(copy8);
1183       __ tbz(count, 2 - exact_log2(granularity), copy4);
1184       __ ldrw(t0, Address(s, 0));
1185       __ ldrw(t1, Address(send, -4));
1186       __ strw(t0, Address(d, 0));
1187       __ strw(t1, Address(dend, -4));
1188       __ b(finish);
1189       if (granularity < 4) {
1190         // 0..3 bytes
1191         __ bind(copy4);
1192         __ cbz(count, finish); // get rid of 0 case
1193         if (granularity == 2) {
1194           __ ldrh(t0, Address(s, 0));
1195           __ strh(t0, Address(d, 0));
1196         } else { // granularity == 1
1197           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1198           // the first and last byte.
1199           // Handle the 3 byte case by loading and storing base + count/2
1200           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1201           // This does mean that in the 1 byte case we load/store the same
1202           // byte 3 times.
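          // e.g. count == 3 copies bytes 0, 2 and 1 (via s + count/2);
          // count == 2 copies bytes 0 and 1 (byte 1 twice); count == 1
          // copies byte 0 three times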
1203           __ lsr(count, count, 1);
1204           __ ldrb(t0, Address(s, 0));
1205           __ ldrb(t1, Address(send, -1));
1206           __ ldrb(t2, Address(s, count));
1207           __ strb(t0, Address(d, 0));
1208           __ strb(t1, Address(dend, -1));
1209           __ strb(t2, Address(d, count));
1210         }
1211         __ b(finish);
1212       }
1213     }
1214 
1215     __ bind(copy_big);
1216     if (is_backwards) {
1217       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1218       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1219     }
1220 
1221     // Now that we've got the small case out of the way we can align the
1222     // source address on a 2-word boundary.
1223 
1224     Label aligned;
1225 
1226     if (is_aligned) {
1227       // We may have to adjust by 1 word to get s 2-word-aligned.
1228       __ tbz(s, exact_log2(wordSize), aligned);
1229       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1230       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1231       __ sub(count, count, wordSize/granularity);
1232     } else {
1233       if (is_backwards) {
1234         __ andr(rscratch2, s, 2 * wordSize - 1);
1235       } else {
1236         __ neg(rscratch2, s);
1237         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1238       }
1239       // rscratch2 is the byte adjustment needed to align s.
1240       __ cbz(rscratch2, aligned);
1241       int shift = exact_log2(granularity);
1242       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1243       __ sub(count, count, rscratch2);
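      // rscratch2 now holds the number of elements (not bytes) needed to
      // bring s up to a 2-word boundary; they are copied separately below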
1244 
1245 #if 0
1246       // ?? This code is only correct for a disjoint copy.  It may or
1247       // may not make sense to use it in that case.
1248 
1249       // Copy the first pair; s and d may not be aligned.
1250       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1251       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1252 
1253       // Align s and d, adjust count
1254       if (is_backwards) {
1255         __ sub(s, s, rscratch2);
1256         __ sub(d, d, rscratch2);
1257       } else {
1258         __ add(s, s, rscratch2);
1259         __ add(d, d, rscratch2);
1260       }
1261 #else
1262       copy_memory_small(s, d, rscratch2, rscratch1, step);
1263 #endif
1264     }
1265 
1266     __ bind(aligned);
1267 
1268     // s is now 2-word-aligned.
1269 
1270     // We have a count of units and some trailing bytes.  Adjust the
1271     // count and do a bulk copy of words.
1272     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1273     if (direction == copy_forwards)
1274       __ bl(copy_f);
1275     else
1276       __ bl(copy_b);
1277 
1278     // And the tail.
1279     copy_memory_small(s, d, count, tmp, step);
1280 
1281     if (granularity >= 8) __ bind(copy8);
1282     if (granularity >= 4) __ bind(copy4);
1283     __ bind(finish);
1284   }
1285 
1286 
1287   void clobber_registers() {
1288 #ifdef ASSERT
1289     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1290     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
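    // splat 0xdeadbeefdeadbeef over the argument and scratch registers
    // so that stale values are easy to spot in debug builds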
1291     for (Register r = r3; r <= r18; r++)
1292       if (r != rscratch1) __ mov(r, rscratch1);
1293 #endif
1294   }
1295 
1296   // Scan over array at a for count oops, verifying each one.
1297   // Preserves a and count, clobbers rscratch1 and rscratch2.
1298   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1299     Label loop, end;
1300     __ mov(rscratch1, a);
1301     __ mov(rscratch2, zr);
1302     __ bind(loop);
1303     __ cmp(rscratch2, count);
1304     __ br(Assembler::HS, end);
1305     if (size == (size_t)wordSize) {
1306       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1307       __ verify_oop(temp);
1308     } else {
1309       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ decode_heap_oop(temp); // calls verify_oop
1311     }
1312     __ add(rscratch2, rscratch2, 1);
1313     __ b(loop);
1314     __ bind(end);
1315   }
1316 
1317   // Arguments:
1318   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1319   //             ignored
1320   //   is_oop  - true => oop array, so generate store check code
1321   //   name    - stub name string
1322   //
1323   // Inputs:
1324   //   c_rarg0   - source array address
1325   //   c_rarg1   - destination array address
1326   //   c_rarg2   - element count, treated as ssize_t, can be zero
1327   //
1328   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1329   // the hardware handle it.  The two dwords within qwords that span
1330   // cache line boundaries will still be loaded and stored atomically.
1331   //
1332   // Side Effects:
1333   //   disjoint_int_copy_entry is set to the no-overlap entry point
1334   //   used by generate_conjoint_int_oop_copy().
1335   //
1336   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1337                                   const char *name, bool dest_uninitialized = false) {
1338     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1339     RegSet saved_reg = RegSet::of(s, d, count);
1340     __ align(CodeEntryAlignment);
1341     StubCodeMark mark(this, "StubRoutines", name);
1342     address start = __ pc();
1343     __ enter();
1344 
1345     if (entry != NULL) {
1346       *entry = __ pc();
1347       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1348       BLOCK_COMMENT("Entry:");
1349     }
1350 
1351     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1352     if (dest_uninitialized) {
1353       decorators |= IS_DEST_UNINITIALIZED;
1354     }
1355     if (aligned) {
1356       decorators |= ARRAYCOPY_ALIGNED;
1357     }
1358 
1359     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1360     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1361 
1362     if (is_oop) {
1363       // save regs before copy_memory
1364       __ push(RegSet::of(d, count), sp);
1365     }
1366     copy_memory(aligned, s, d, count, rscratch1, size);
1367 
1368     if (is_oop) {
1369       __ pop(RegSet::of(d, count), sp);
1370       if (VerifyOops)
1371         verify_oop_array(size, d, count, r16);
1372     }
1373 
1374     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1375 
1376     __ leave();
1377     __ mov(r0, zr); // return 0
1378     __ ret(lr);
1379     return start;
1380   }
1381 
1382   // Arguments:
1383   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1384   //             ignored
1385   //   is_oop  - true => oop array, so generate store check code
1386   //   name    - stub name string
1387   //
1388   // Inputs:
1389   //   c_rarg0   - source array address
1390   //   c_rarg1   - destination array address
1391   //   c_rarg2   - element count, treated as ssize_t, can be zero
1392   //
1393   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1394   // the hardware handle it.  The two dwords within qwords that span
1395   // cache line boundaries will still be loaded and stored atomically.
1396   //
1397   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1398                                  address *entry, const char *name,
1399                                  bool dest_uninitialized = false) {
1400     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1401     RegSet saved_regs = RegSet::of(s, d, count);
1402     StubCodeMark mark(this, "StubRoutines", name);
1403     address start = __ pc();
1404     __ enter();
1405 
1406     if (entry != NULL) {
1407       *entry = __ pc();
1408       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1409       BLOCK_COMMENT("Entry:");
1410     }
1411 
1412     // use fwd copy when (d-s) above_equal (count*size)
1413     __ sub(rscratch1, d, s);
1414     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1415     __ br(Assembler::HS, nooverlap_target);
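    // (the unsigned comparison also routes the d < s case to the forward
    // copy, which is always safe even if the regions overlap)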
1416 
1417     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1418     if (dest_uninitialized) {
1419       decorators |= IS_DEST_UNINITIALIZED;
1420     }
1421     if (aligned) {
1422       decorators |= ARRAYCOPY_ALIGNED;
1423     }
1424 
1425     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1426     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1427 
1428     if (is_oop) {
1429       // save regs before copy_memory
1430       __ push(RegSet::of(d, count), sp);
1431     }
1432     copy_memory(aligned, s, d, count, rscratch1, -size);
1433     if (is_oop) {
1434       __ pop(RegSet::of(d, count), sp);
1435       if (VerifyOops)
1436         verify_oop_array(size, d, count, r16);
1437     }
1438     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1439     __ leave();
1440     __ mov(r0, zr); // return 0
1441     __ ret(lr);
1442     return start;
1443 }
1444 
1445   // Arguments:
1446   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1447   //             ignored
1448   //   name    - stub name string
1449   //
1450   // Inputs:
1451   //   c_rarg0   - source array address
1452   //   c_rarg1   - destination array address
1453   //   c_rarg2   - element count, treated as ssize_t, can be zero
1454   //
1455   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1456   // we let the hardware handle it.  The one to eight bytes within words,
1457   // dwords or qwords that span cache line boundaries will still be loaded
1458   // and stored atomically.
1459   //
1467   // Side Effects:
1468   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1469   //   used by generate_conjoint_byte_copy().
1470   //
1471   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1472     const bool not_oop = false;
1473     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1474   }
1475 
1476   // Arguments:
1477   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1478   //             ignored
1479   //   name    - stub name string
1480   //
1481   // Inputs:
1482   //   c_rarg0   - source array address
1483   //   c_rarg1   - destination array address
1484   //   c_rarg2   - element count, treated as ssize_t, can be zero
1485   //
1486   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1487   // we let the hardware handle it.  The one to eight bytes within words,
1488   // dwords or qwords that span cache line boundaries will still be loaded
1489   // and stored atomically.
1490   //
1491   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1492                                       address* entry, const char *name) {
1493     const bool not_oop = false;
1494     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1495   }
1496 
1497   // Arguments:
1498   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1499   //             ignored
1500   //   name    - stub name string
1501   //
1502   // Inputs:
1503   //   c_rarg0   - source array address
1504   //   c_rarg1   - destination array address
1505   //   c_rarg2   - element count, treated as ssize_t, can be zero
1506   //
1507   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1508   // let the hardware handle it.  The two or four words within dwords
1509   // or qwords that span cache line boundaries will still be loaded
1510   // and stored atomically.
1511   //
1512   // Side Effects:
1513   //   disjoint_short_copy_entry is set to the no-overlap entry point
1514   //   used by generate_conjoint_short_copy().
1515   //
1516   address generate_disjoint_short_copy(bool aligned,
1517                                        address* entry, const char *name) {
1518     const bool not_oop = false;
1519     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1520   }
1521 
1522   // Arguments:
1523   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1524   //             ignored
1525   //   name    - stub name string
1526   //
1527   // Inputs:
1528   //   c_rarg0   - source array address
1529   //   c_rarg1   - destination array address
1530   //   c_rarg2   - element count, treated as ssize_t, can be zero
1531   //
1532   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1533   // let the hardware handle it.  The two or four words within dwords
1534   // or qwords that span cache line boundaries will still be loaded
1535   // and stored atomically.
1536   //
1537   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1538                                        address *entry, const char *name) {
1539     const bool not_oop = false;
1540     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1541   }
1542 
1543   // Arguments:
1544   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1545   //             ignored
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554   // the hardware handle it.  The two dwords within qwords that span
1555   // cache line boundaries will still be loaded and stored atomically.
1556   //
1557   // Side Effects:
1558   //   disjoint_int_copy_entry is set to the no-overlap entry point
1559   //   used by generate_conjoint_int_oop_copy().
1560   //
1561   address generate_disjoint_int_copy(bool aligned, address *entry,
1562                                          const char *name, bool dest_uninitialized = false) {
1563     const bool not_oop = false;
1564     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578   // the hardware handle it.  The two dwords within qwords that span
1579   // cache line boundaries will still be loaded and stored atomically.
1580   //
1581   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1582                                      address *entry, const char *name,
1583                                      bool dest_uninitialized = false) {
1584     const bool not_oop = false;
1585     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1586   }
1587 
1588 
1589   // Arguments:
1590   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1591   //             ignored
1592   //   name    - stub name string
1593   //
1594   // Inputs:
1595   //   c_rarg0   - source array address
1596   //   c_rarg1   - destination array address
1597   //   c_rarg2   - element count, treated as size_t, can be zero
1598   //
1599   // Side Effects:
1600   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1601   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1602   //
1603   address generate_disjoint_long_copy(bool aligned, address *entry,
1604                                           const char *name, bool dest_uninitialized = false) {
1605     const bool not_oop = false;
1606     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as size_t, can be zero
1618   //
1619   address generate_conjoint_long_copy(bool aligned,
1620                                       address nooverlap_target, address *entry,
1621                                       const char *name, bool dest_uninitialized = false) {
1622     const bool not_oop = false;
1623     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1624   }
1625 
1626   // Arguments:
1627   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1628   //             ignored
1629   //   name    - stub name string
1630   //
1631   // Inputs:
1632   //   c_rarg0   - source array address
1633   //   c_rarg1   - destination array address
1634   //   c_rarg2   - element count, treated as size_t, can be zero
1635   //
1636   // Side Effects:
1637   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1638   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1639   //
1640   address generate_disjoint_oop_copy(bool aligned, address *entry,
1641                                      const char *name, bool dest_uninitialized) {
1642     const bool is_oop = true;
1643     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1644     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1645   }
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as size_t, can be zero
1656   //
1657   address generate_conjoint_oop_copy(bool aligned,
1658                                      address nooverlap_target, address *entry,
1659                                      const char *name, bool dest_uninitialized) {
1660     const bool is_oop = true;
1661     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1663                                   name, dest_uninitialized);
1664   }
1665 
1666 
1667   // Helper for generating a dynamic type check.
1668   // Smashes rscratch1, rscratch2.
1669   void generate_type_check(Register sub_klass,
1670                            Register super_check_offset,
1671                            Register super_klass,
1672                            Label& L_success) {
1673     assert_different_registers(sub_klass, super_check_offset, super_klass);
1674 
1675     BLOCK_COMMENT("type_check:");
1676 
1677     Label L_miss;
1678 
1679     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1680                                      super_check_offset);
1681     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1682 
1683     // Fall through on failure!
1684     __ BIND(L_miss);
1685   }
1686 
1687   //
1688   //  Generate checkcasting array copy stub
1689   //
1690   //  Input:
1691   //    c_rarg0   - source array address
1692   //    c_rarg1   - destination array address
1693   //    c_rarg2   - element count, treated as ssize_t, can be zero
1694   //    c_rarg3   - size_t ckoff (super_check_offset)
1695   //    c_rarg4   - oop ckval (super_klass)
1696   //
1697   //  Output:
1698   //    r0 ==  0  -  success
1699   //    r0 == -1^K - failure, where K is partial transfer count
1700   //
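  //  For example, a failure after successfully copying K == 3 elements
  //  returns r0 == -1^3 == -4 (i.e. ~K); the caller recovers K as ~r0.
  //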
1701   address generate_checkcast_copy(const char *name, address *entry,
1702                                   bool dest_uninitialized = false) {
1703 
1704     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1705 
1706     // Input registers (after setup_arg_regs)
1707     const Register from        = c_rarg0;   // source array address
1708     const Register to          = c_rarg1;   // destination array address
1709     const Register count       = c_rarg2;   // elements count
1710     const Register ckoff       = c_rarg3;   // super_check_offset
1711     const Register ckval       = c_rarg4;   // super_klass
1712 
1713     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1714     RegSet wb_post_saved_regs = RegSet::of(count);
1715 
1716     // Registers used as temps (r18, r19, r20 are save-on-entry)
1717     const Register count_save  = r21;       // orig elements count
1718     const Register start_to    = r20;       // destination array start address
1719     const Register copied_oop  = r18;       // actual oop copied
1720     const Register r19_klass   = r19;       // oop._klass
1721 
1722     //---------------------------------------------------------------
1723     // Assembler stub will be used for this call to arraycopy
1724     // if the two arrays are subtypes of Object[] but the
1725     // destination array type is not equal to or a supertype
1726     // of the source type.  Each element must be separately
1727     // checked.
1728 
1729     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1730                                copied_oop, r19_klass, count_save);
1731 
1732     __ align(CodeEntryAlignment);
1733     StubCodeMark mark(this, "StubRoutines", name);
1734     address start = __ pc();
1735 
1736     __ enter(); // required for proper stackwalking of RuntimeStub frame
1737 
1738 #ifdef ASSERT
1739     // caller guarantees that the arrays really are different
1740     // otherwise, we would have to make conjoint checks
1741     { Label L;
1742       array_overlap_test(L, TIMES_OOP);
1743       __ stop("checkcast_copy within a single array");
1744       __ bind(L);
1745     }
1746 #endif //ASSERT
1747 
1748     // Caller of this entry point must set up the argument registers.
1749     if (entry != NULL) {
1750       *entry = __ pc();
1751       BLOCK_COMMENT("Entry:");
1752     }
1753 
1754      // Empty array:  Nothing to do.
1755     __ cbz(count, L_done);
1756 
1757     __ push(RegSet::of(r18, r19, r20, r21), sp);
1758 
1759 #ifdef ASSERT
1760     BLOCK_COMMENT("assert consistent ckoff/ckval");
1761     // The ckoff and ckval must be mutually consistent,
1762     // even though caller generates both.
1763     { Label L;
1764       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1765       __ ldrw(start_to, Address(ckval, sco_offset));
1766       __ cmpw(ckoff, start_to);
1767       __ br(Assembler::EQ, L);
1768       __ stop("super_check_offset inconsistent");
1769       __ bind(L);
1770     }
1771 #endif //ASSERT
1772 
1773     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1774     bool is_oop = true;
1775     if (dest_uninitialized) {
1776       decorators |= IS_DEST_UNINITIALIZED;
1777     }
1778 
1779     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1780     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1781 
1782     // save the original count
1783     __ mov(count_save, count);
1784 
1785     // Copy from low to high addresses
1786     __ mov(start_to, to);              // Save destination array start address
1787     __ b(L_load_element);
1788 
1789     // ======== begin loop ========
1790     // (Loop is rotated; its entry is L_load_element.)
1791     // Loop control:
1792     //   for (; count != 0; count--) {
1793     //     copied_oop = load_heap_oop(from++);
1794     //     ... generate_type_check ...;
1795     //     store_heap_oop(to++, copied_oop);
1796     //   }
1797     __ align(OptoLoopAlignment);
1798 
1799     __ BIND(L_store_element);
1800     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1801     __ sub(count, count, 1);
1802     __ cbz(count, L_do_card_marks);
1803 
1804     // ======== loop entry is here ========
1805     __ BIND(L_load_element);
1806     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1807     __ cbz(copied_oop, L_store_element);
1808 
1809     __ load_klass(r19_klass, copied_oop);// query the object klass
1810     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1811     // ======== end loop ========
1812 
1813     // It was a real error; we must depend on the caller to finish the job.
1814     // Register count = remaining oops, count_orig = total oops.
1815     // Emit GC store barriers for the oops we have copied and report
1816     // their number to the caller.
1817 
1818     __ subs(count, count_save, count);     // K = partially copied oop count
1819     __ eon(count, count, zr);                   // report (-1^K) to caller
1820     __ br(Assembler::EQ, L_done_pop);
1821 
1822     __ BIND(L_do_card_marks);
1823     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1824 
1825     __ bind(L_done_pop);
1826     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1827     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1828 
1829     __ bind(L_done);
1830     __ mov(r0, count);
1831     __ leave();
1832     __ ret(lr);
1833 
1834     return start;
1835   }
1836 
1837   // Perform range checks on the proposed arraycopy.
1838   // Kills temp, but nothing else.
1839   // Also, clean the sign bits of src_pos and dst_pos.
1840   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1841                               Register src_pos, // source position (c_rarg1)
1842                               Register dst,     // destination array oop (c_rarg2)
1843                               Register dst_pos, // destination position (c_rarg3)
1844                               Register length,
1845                               Register temp,
1846                               Label& L_failed) {
1847     BLOCK_COMMENT("arraycopy_range_checks:");
1848 
1849     assert_different_registers(rscratch1, temp);
1850 
1851     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1852     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1853     __ addw(temp, length, src_pos);
1854     __ cmpw(temp, rscratch1);
1855     __ br(Assembler::HI, L_failed);
1856 
1857     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1858     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1859     __ addw(temp, length, dst_pos);
1860     __ cmpw(temp, rscratch1);
1861     __ br(Assembler::HI, L_failed);
1862 
1863     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1864     __ movw(src_pos, src_pos);
1865     __ movw(dst_pos, dst_pos);
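    // (writing the 32-bit view of a register zero-extends into the upper
    //  half, i.e. this is equivalent to src_pos &= 0xFFFFFFFF)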
1866 
1867     BLOCK_COMMENT("arraycopy_range_checks done");
1868   }
1869 
1870   // These stubs are currently only called from a simple test routine.
1871   // They will be written properly once they are called from code that
1872   // actually does some real work.
1873   static void fake_arraycopy_stub(address src, address dst, int count) {
1874     assert(count == 0, "huh?");
1875   }
1876 
1877 
1878   //
1879   //  Generate 'unsafe' array copy stub
1880   //  Though just as safe as the other stubs, it takes an unscaled
1881   //  size_t argument instead of an element count.
1882   //
1883   //  Input:
1884   //    c_rarg0   - source array address
1885   //    c_rarg1   - destination array address
1886   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1887   //
1888   // Examines the alignment of the operands and dispatches
1889   // to a long, int, short, or byte copy loop.
1890   //
1891   address generate_unsafe_copy(const char *name,
1892                                address byte_copy_entry,
1893                                address short_copy_entry,
1894                                address int_copy_entry,
1895                                address long_copy_entry) {
1896     Label L_long_aligned, L_int_aligned, L_short_aligned;
1897     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1898 
1899     __ align(CodeEntryAlignment);
1900     StubCodeMark mark(this, "StubRoutines", name);
1901     address start = __ pc();
1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
1903 
1904     // bump this on entry, not on exit:
1905     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1906 
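    // OR together the source address, the destination address and the byte
    // count: the lowest set bit of the result bounds the largest element size
    // for which all three are aligned, and we dispatch to the widest copy
    // loop that alignment allows (long, int, short, otherwise byte).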
1907     __ orr(rscratch1, s, d);
1908     __ orr(rscratch1, rscratch1, count);
1909 
1910     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1911     __ cbz(rscratch1, L_long_aligned);
1912     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1913     __ cbz(rscratch1, L_int_aligned);
1914     __ tbz(rscratch1, 0, L_short_aligned);
1915     __ b(RuntimeAddress(byte_copy_entry));
1916 
1917     __ BIND(L_short_aligned);
1918     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1919     __ b(RuntimeAddress(short_copy_entry));
1920     __ BIND(L_int_aligned);
1921     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1922     __ b(RuntimeAddress(int_copy_entry));
1923     __ BIND(L_long_aligned);
1924     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1925     __ b(RuntimeAddress(long_copy_entry));
1926 
1927     return start;
1928   }
1929 
1930   //
1931   //  Generate generic array copy stubs
1932   //
1933   //  Input:
1934   //    c_rarg0    -  src oop
1935   //    c_rarg1    -  src_pos (32 bits)
1936   //    c_rarg2    -  dst oop
1937   //    c_rarg3    -  dst_pos (32 bits)
1938   //    c_rarg4    -  element count (32 bits)
1939   //
1940   //  Output:
1941   //    r0 ==  0  -  success
1942   //    r0 == -1^K - failure, where K is partial transfer count
1943   //
1944   address generate_generic_copy(const char *name,
1945                                 address byte_copy_entry, address short_copy_entry,
1946                                 address int_copy_entry, address oop_copy_entry,
1947                                 address long_copy_entry, address checkcast_copy_entry) {
1948 
1949     Label L_failed, L_failed_0, L_objArray;
1950     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1951 
1952     // Input registers
1953     const Register src        = c_rarg0;  // source array oop
1954     const Register src_pos    = c_rarg1;  // source position
1955     const Register dst        = c_rarg2;  // destination array oop
1956     const Register dst_pos    = c_rarg3;  // destination position
1957     const Register length     = c_rarg4;
1958 
1959     __ align(CodeEntryAlignment);
1960 
1961     StubCodeMark mark(this, "StubRoutines", name);
1962 
1963     // Registers used as temps
1964     const Register dst_klass  = c_rarg5;
1965 
1966     address start = __ pc();
1967 
1968     __ enter(); // required for proper stackwalking of RuntimeStub frame
1969 
1970     // bump this on entry, not on exit:
1971     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1972 
1973     //-----------------------------------------------------------------------
1974     // Assembler stub will be used for this call to arraycopy
1975     // if the following conditions are met:
1976     //
1977     // (1) src and dst must not be null.
1978     // (2) src_pos must not be negative.
1979     // (3) dst_pos must not be negative.
1980     // (4) length  must not be negative.
1981     // (5) src klass and dst klass should be the same and not NULL.
1982     // (6) src and dst should be arrays.
1983     // (7) src_pos + length must not exceed length of src.
1984     // (8) dst_pos + length must not exceed length of dst.
1985     //
1986 
1987     //  if (src == NULL) return -1;
1988     __ cbz(src, L_failed);
1989 
1990     //  if (src_pos < 0) return -1;
1991     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1992 
1993     //  if (dst == NULL) return -1;
1994     __ cbz(dst, L_failed);
1995 
1996     //  if (dst_pos < 0) return -1;
1997     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1998 
1999     // registers used as temp
2000     const Register scratch_length    = r16; // elements count to copy
2001     const Register scratch_src_klass = r17; // array klass
2002     const Register lh                = r18; // layout helper
2003 
2004     //  if (length < 0) return -1;
2005     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2006     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2007 
2008     __ load_klass(scratch_src_klass, src);
2009 #ifdef ASSERT
2010     //  assert(src->klass() != NULL);
2011     {
2012       BLOCK_COMMENT("assert klasses not null {");
2013       Label L1, L2;
2014       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2015       __ bind(L1);
2016       __ stop("broken null klass");
2017       __ bind(L2);
2018       __ load_klass(rscratch1, dst);
2019       __ cbz(rscratch1, L1);     // this would be broken also
2020       BLOCK_COMMENT("} assert klasses not null done");
2021     }
2022 #endif
2023 
2024     // Load layout helper (32-bits)
2025     //
2026     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2027     // 32        30    24            16              8     2                 0
2028     //
2029     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2030     //
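    // Decoded roughly as follows (a sketch; the authoritative shifts and
    // masks are the Klass::_lh_* constants used below):
    //   bool is_array    = lh < 0;                         // tag 0x2/0x3 sets the sign bit
    //   int  header_size = (lh >> Klass::_lh_header_size_shift)
    //                      & Klass::_lh_header_size_mask;  // in bytes
    //   int  log2_elsize = lh & Klass::_lh_log2_element_size_mask;  // 0..3 for primitives
    //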
2031 
2032     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2033 
2034     // Handle objArrays completely differently...
2035     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2036     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2037     __ movw(rscratch1, objArray_lh);
2038     __ eorw(rscratch2, lh, rscratch1);
2039     __ cbzw(rscratch2, L_objArray);
2040 
2041     //  if (src->klass() != dst->klass()) return -1;
2042     __ load_klass(rscratch2, dst);
2043     __ eor(rscratch2, rscratch2, scratch_src_klass);
2044     __ cbnz(rscratch2, L_failed);
2045 
2046     //  if (!src->is_Array()) return -1;
2047     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2048 
2049     // At this point, it is known to be a typeArray (array_tag 0x3).
2050 #ifdef ASSERT
2051     {
2052       BLOCK_COMMENT("assert primitive array {");
2053       Label L;
2054       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2055       __ cmpw(lh, rscratch2);
2056       __ br(Assembler::GE, L);
2057       __ stop("must be a primitive array");
2058       __ bind(L);
2059       BLOCK_COMMENT("} assert primitive array done");
2060     }
2061 #endif
2062 
2063     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2064                            rscratch2, L_failed);
2065 
2066     // TypeArrayKlass
2067     //
2068     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2069     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2070     //
2071 
2072     const Register rscratch1_offset = rscratch1;    // array offset
2073     const Register r18_elsize = lh; // element size
2074 
2075     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2076            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2077     __ add(src, src, rscratch1_offset);           // src array offset
2078     __ add(dst, dst, rscratch1_offset);           // dst array offset
2079     BLOCK_COMMENT("choose copy loop based on element size");
2080 
2081     // next registers should be set before the jump to corresponding stub
2082     const Register from     = c_rarg0;  // source array address
2083     const Register to       = c_rarg1;  // destination array address
2084     const Register count    = c_rarg2;  // elements count
2085 
2086     // 'from', 'to', 'count' registers should be set in such order
2087     // since they are the same as 'src', 'src_pos', 'dst'.
2088 
2089     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2090 
2091     // The possible values of elsize are 0-3, i.e. exact_log2(element
2092     // size in bytes).  We do a simple bitwise binary search.
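    // In effect this is a two-level test on the bits of elsize:
    //   bit 1 clear -> byte (bit 0 clear) or short (bit 0 set) copy,
    //   bit 1 set   -> int  (bit 0 clear) or long  (bit 0 set) copy.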
2093   __ BIND(L_copy_bytes);
2094     __ tbnz(r18_elsize, 1, L_copy_ints);
2095     __ tbnz(r18_elsize, 0, L_copy_shorts);
2096     __ lea(from, Address(src, src_pos));// src_addr
2097     __ lea(to,   Address(dst, dst_pos));// dst_addr
2098     __ movw(count, scratch_length); // length
2099     __ b(RuntimeAddress(byte_copy_entry));
2100 
2101   __ BIND(L_copy_shorts);
2102     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2103     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2104     __ movw(count, scratch_length); // length
2105     __ b(RuntimeAddress(short_copy_entry));
2106 
2107   __ BIND(L_copy_ints);
2108     __ tbnz(r18_elsize, 0, L_copy_longs);
2109     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2110     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2111     __ movw(count, scratch_length); // length
2112     __ b(RuntimeAddress(int_copy_entry));
2113 
2114   __ BIND(L_copy_longs);
2115 #ifdef ASSERT
2116     {
2117       BLOCK_COMMENT("assert long copy {");
2118       Label L;
2119       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2120       __ cmpw(r18_elsize, LogBytesPerLong);
2121       __ br(Assembler::EQ, L);
2122       __ stop("must be long copy, but elsize is wrong");
2123       __ bind(L);
2124       BLOCK_COMMENT("} assert long copy done");
2125     }
2126 #endif
2127     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2128     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2129     __ movw(count, scratch_length); // length
2130     __ b(RuntimeAddress(long_copy_entry));
2131 
2132     // ObjArrayKlass
2133   __ BIND(L_objArray);
2134     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2135 
2136     Label L_plain_copy, L_checkcast_copy;
2137     //  test array classes for subtyping
2138     __ load_klass(r18, dst);
2139     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2140     __ br(Assembler::NE, L_checkcast_copy);
2141 
2142     // Identically typed arrays can be copied without element-wise checks.
2143     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2144                            rscratch2, L_failed);
2145 
2146     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2147     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2148     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2149     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2150     __ movw(count, scratch_length); // length
2151   __ BIND(L_plain_copy);
2152     __ b(RuntimeAddress(oop_copy_entry));
2153 
2154   __ BIND(L_checkcast_copy);
2155     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2156     {
2157       // Before looking at dst.length, make sure dst is also an objArray.
2158       __ ldrw(rscratch1, Address(r18, lh_offset));
2159       __ movw(rscratch2, objArray_lh);
2160       __ eorw(rscratch1, rscratch1, rscratch2);
2161       __ cbnzw(rscratch1, L_failed);
2162 
2163       // It is safe to examine both src.length and dst.length.
2164       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2165                              r18, L_failed);
2166 
2167       __ load_klass(dst_klass, dst); // reload
2168 
2169       // Marshal the base address arguments now, freeing registers.
2170       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2171       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2172       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2173       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174       __ movw(count, length);           // length (reloaded)
2175       Register sco_temp = c_rarg3;      // this register is free now
2176       assert_different_registers(from, to, count, sco_temp,
2177                                  dst_klass, scratch_src_klass);
2178       // assert_clean_int(count, sco_temp);
2179 
2180       // Generate the type check.
2181       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2182       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2183 
2184       // Smashes rscratch1, rscratch2
2185       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2186 
2187       // Fetch destination element klass from the ObjArrayKlass header.
2188       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2189       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2190       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2191 
2192       // the checkcast_copy loop needs two extra arguments:
2193       assert(c_rarg3 == sco_temp, "#3 already in place");
2194       // Set up arguments for checkcast_copy_entry.
2195       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2196       __ b(RuntimeAddress(checkcast_copy_entry));
2197     }
2198 
2199   __ BIND(L_failed);
2200     __ mov(r0, -1);
2201     __ leave();   // required for proper stackwalking of RuntimeStub frame
2202     __ ret(lr);
2203 
2204     return start;
2205   }
2206 
2207   //
2208   // Generate stub for array fill. If "aligned" is true, the
2209   // "to" address is assumed to be heapword aligned.
2210   //
2211   // Arguments for generated stub:
2212   //   to:    c_rarg0
2213   //   value: c_rarg1
2214   //   count: c_rarg2 treated as signed
2215   //
2216   address generate_fill(BasicType t, bool aligned, const char *name) {
2217     __ align(CodeEntryAlignment);
2218     StubCodeMark mark(this, "StubRoutines", name);
2219     address start = __ pc();
2220 
2221     BLOCK_COMMENT("Entry:");
2222 
2223     const Register to        = c_rarg0;  // source array address
2224     const Register value     = c_rarg1;  // value
2225     const Register count     = c_rarg2;  // elements count
2226 
2227     const Register bz_base = r10;        // base for block_zero routine
2228     const Register cnt_words = r11;      // temp register
2229 
2230     __ enter();
2231 
2232     Label L_fill_elements, L_exit1;
2233 
2234     int shift = -1;
2235     switch (t) {
2236       case T_BYTE:
2237         shift = 0;
2238         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2239         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2240         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2241         __ br(Assembler::LO, L_fill_elements);
2242         break;
2243       case T_SHORT:
2244         shift = 1;
2245         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2246         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2247         __ br(Assembler::LO, L_fill_elements);
2248         break;
2249       case T_INT:
2250         shift = 2;
2251         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2252         __ br(Assembler::LO, L_fill_elements);
2253         break;
2254       default: ShouldNotReachHere();
2255     }
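    // For T_BYTE and T_SHORT, 'value' has now been replicated to fill its low
    // 32 bits (e.g. a byte fill value of 0xAB becomes 0xABABABAB); the final
    // 32 -> 64 bit replication for all types happens just before the bulk
    // word fill below.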
2256 
2257     // Align source address at 8 bytes address boundary.
2258     Label L_skip_align1, L_skip_align2, L_skip_align4;
2259     if (!aligned) {
2260       switch (t) {
2261         case T_BYTE:
2262           // One byte misalignment happens only for byte arrays.
2263           __ tbz(to, 0, L_skip_align1);
2264           __ strb(value, Address(__ post(to, 1)));
2265           __ subw(count, count, 1);
2266           __ bind(L_skip_align1);
2267           // Fallthrough
2268         case T_SHORT:
2269           // Two bytes misalignment happens only for byte and short (char) arrays.
2270           __ tbz(to, 1, L_skip_align2);
2271           __ strh(value, Address(__ post(to, 2)));
2272           __ subw(count, count, 2 >> shift);
2273           __ bind(L_skip_align2);
2274           // Fallthrough
2275         case T_INT:
2276           // Align to 8 bytes, we know we are 4 byte aligned to start.
2277           __ tbz(to, 2, L_skip_align4);
2278           __ strw(value, Address(__ post(to, 4)));
2279           __ subw(count, count, 4 >> shift);
2280           __ bind(L_skip_align4);
2281           break;
2282         default: ShouldNotReachHere();
2283       }
2284     }
2285 
2286     //
2287     //  Fill large chunks
2288     //
2289     __ lsrw(cnt_words, count, 3 - shift); // number of words
2290     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2291     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
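    // cnt_words = count >> (3 - shift) is the number of whole 8-byte words to
    // fill; the subw leaves 'count' holding the residual element count (less
    // than one word's worth), which is dealt with after the bulk fill.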
2292     if (UseBlockZeroing) {
2293       Label non_block_zeroing, rest;
2294       // If the fill value is zero we can use the fast zero_words().
2295       __ cbnz(value, non_block_zeroing);
2296       __ mov(bz_base, to);
2297       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2298       __ zero_words(bz_base, cnt_words);
2299       __ b(rest);
2300       __ bind(non_block_zeroing);
2301       __ fill_words(to, cnt_words, value);
2302       __ bind(rest);
2303     } else {
2304       __ fill_words(to, cnt_words, value);
2305     }
2306 
2307     // Remaining count is less than 8 bytes. Fill it by a single store.
2308     // Note that the total length is no less than 8 bytes.
2309     if (t == T_BYTE || t == T_SHORT) {
2310       Label L_exit1;
2311       __ cbzw(count, L_exit1);
2312       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2313       __ str(value, Address(to, -8));    // overwrite some elements
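      // This 8-byte store ends exactly at the end of the requested region, so
      // it may re-write elements already filled above; since it starts on an
      // element boundary and 'value' carries the fill pattern replicated
      // across all 64 bits, the overlap is harmless.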
2314       __ bind(L_exit1);
2315       __ leave();
2316       __ ret(lr);
2317     }
2318 
2319     // Handle copies less than 8 bytes.
2320     Label L_fill_2, L_fill_4, L_exit2;
2321     __ bind(L_fill_elements);
2322     switch (t) {
2323       case T_BYTE:
2324         __ tbz(count, 0, L_fill_2);
2325         __ strb(value, Address(__ post(to, 1)));
2326         __ bind(L_fill_2);
2327         __ tbz(count, 1, L_fill_4);
2328         __ strh(value, Address(__ post(to, 2)));
2329         __ bind(L_fill_4);
2330         __ tbz(count, 2, L_exit2);
2331         __ strw(value, Address(to));
2332         break;
2333       case T_SHORT:
2334         __ tbz(count, 0, L_fill_4);
2335         __ strh(value, Address(__ post(to, 2)));
2336         __ bind(L_fill_4);
2337         __ tbz(count, 1, L_exit2);
2338         __ strw(value, Address(to));
2339         break;
2340       case T_INT:
2341         __ cbzw(count, L_exit2);
2342         __ strw(value, Address(to));
2343         break;
2344       default: ShouldNotReachHere();
2345     }
2346     __ bind(L_exit2);
2347     __ leave();
2348     __ ret(lr);
2349     return start;
2350   }
2351 
2352   void generate_arraycopy_stubs() {
2353     address entry;
2354     address entry_jbyte_arraycopy;
2355     address entry_jshort_arraycopy;
2356     address entry_jint_arraycopy;
2357     address entry_oop_arraycopy;
2358     address entry_jlong_arraycopy;
2359     address entry_checkcast_arraycopy;
2360 
2361     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2362     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2363 
2364     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2365 
2366     //*** jbyte
2367     // Always need aligned and unaligned versions
2368     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2369                                                                                   "jbyte_disjoint_arraycopy");
2370     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2371                                                                                   &entry_jbyte_arraycopy,
2372                                                                                   "jbyte_arraycopy");
2373     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2374                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2375     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2376                                                                                   "arrayof_jbyte_arraycopy");
2377 
2378     //*** jshort
2379     // Always need aligned and unaligned versions
2380     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2381                                                                                     "jshort_disjoint_arraycopy");
2382     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2383                                                                                     &entry_jshort_arraycopy,
2384                                                                                     "jshort_arraycopy");
2385     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2386                                                                                     "arrayof_jshort_disjoint_arraycopy");
2387     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2388                                                                                     "arrayof_jshort_arraycopy");
2389 
2390     //*** jint
2391     // Aligned versions
2392     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2393                                                                                 "arrayof_jint_disjoint_arraycopy");
2394     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2395                                                                                 "arrayof_jint_arraycopy");
2396     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2397     // entry_jint_arraycopy always points to the unaligned version
2398     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2399                                                                                 "jint_disjoint_arraycopy");
2400     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2401                                                                                 &entry_jint_arraycopy,
2402                                                                                 "jint_arraycopy");
2403 
2404     //*** jlong
2405     // It is always aligned
2406     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2407                                                                                   "arrayof_jlong_disjoint_arraycopy");
2408     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2409                                                                                   "arrayof_jlong_arraycopy");
2410     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2411     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2412 
2413     //*** oops
2414     {
2415       // With compressed oops we need unaligned versions; notice that
2416       // we overwrite entry_oop_arraycopy.
2417       bool aligned = !UseCompressedOops;
2418 
2419       StubRoutines::_arrayof_oop_disjoint_arraycopy
2420         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2421                                      /*dest_uninitialized*/false);
2422       StubRoutines::_arrayof_oop_arraycopy
2423         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2424                                      /*dest_uninitialized*/false);
2425       // Aligned versions without pre-barriers
2426       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2427         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2428                                      /*dest_uninitialized*/true);
2429       StubRoutines::_arrayof_oop_arraycopy_uninit
2430         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2431                                      /*dest_uninitialized*/true);
2432     }
2433 
2434     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2435     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2436     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2437     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2438 
2439     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2440     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2441                                                                         /*dest_uninitialized*/true);
2442 
2443     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2444                                                               entry_jbyte_arraycopy,
2445                                                               entry_jshort_arraycopy,
2446                                                               entry_jint_arraycopy,
2447                                                               entry_jlong_arraycopy);
2448 
2449     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2450                                                                entry_jbyte_arraycopy,
2451                                                                entry_jshort_arraycopy,
2452                                                                entry_jint_arraycopy,
2453                                                                entry_oop_arraycopy,
2454                                                                entry_jlong_arraycopy,
2455                                                                entry_checkcast_arraycopy);
2456 
2457     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2458     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2459     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2460     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2461     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2462     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2463   }
2464 
2465   void generate_math_stubs() { Unimplemented(); }
2466 
2467   // Arguments:
2468   //
2469   // Inputs:
2470   //   c_rarg0   - source byte array address
2471   //   c_rarg1   - destination byte array address
2472   //   c_rarg2   - K (key) in little endian int array
2473   //
2474   address generate_aescrypt_encryptBlock() {
2475     __ align(CodeEntryAlignment);
2476     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2477 
2478     Label L_doLast;
2479 
2480     const Register from        = c_rarg0;  // source array address
2481     const Register to          = c_rarg1;  // destination array address
2482     const Register key         = c_rarg2;  // key array address
2483     const Register keylen      = rscratch1;
2484 
2485     address start = __ pc();
2486     __ enter();
2487 
2488     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
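    // keylen is the expanded key length in ints: 44, 52 or 60 for 128-, 192-
    // and 256-bit keys (10, 12 or 14 rounds respectively).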
2489 
2490     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2491 
2492     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2493     __ rev32(v1, __ T16B, v1);
2494     __ rev32(v2, __ T16B, v2);
2495     __ rev32(v3, __ T16B, v3);
2496     __ rev32(v4, __ T16B, v4);
2497     __ aese(v0, v1);
2498     __ aesmc(v0, v0);
2499     __ aese(v0, v2);
2500     __ aesmc(v0, v0);
2501     __ aese(v0, v3);
2502     __ aesmc(v0, v0);
2503     __ aese(v0, v4);
2504     __ aesmc(v0, v0);
2505 
2506     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2507     __ rev32(v1, __ T16B, v1);
2508     __ rev32(v2, __ T16B, v2);
2509     __ rev32(v3, __ T16B, v3);
2510     __ rev32(v4, __ T16B, v4);
2511     __ aese(v0, v1);
2512     __ aesmc(v0, v0);
2513     __ aese(v0, v2);
2514     __ aesmc(v0, v0);
2515     __ aese(v0, v3);
2516     __ aesmc(v0, v0);
2517     __ aese(v0, v4);
2518     __ aesmc(v0, v0);
2519 
2520     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2521     __ rev32(v1, __ T16B, v1);
2522     __ rev32(v2, __ T16B, v2);
2523 
2524     __ cmpw(keylen, 44);
2525     __ br(Assembler::EQ, L_doLast);
2526 
2527     __ aese(v0, v1);
2528     __ aesmc(v0, v0);
2529     __ aese(v0, v2);
2530     __ aesmc(v0, v0);
2531 
2532     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2533     __ rev32(v1, __ T16B, v1);
2534     __ rev32(v2, __ T16B, v2);
2535 
2536     __ cmpw(keylen, 52);
2537     __ br(Assembler::EQ, L_doLast);
2538 
2539     __ aese(v0, v1);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v2);
2542     __ aesmc(v0, v0);
2543 
2544     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2545     __ rev32(v1, __ T16B, v1);
2546     __ rev32(v2, __ T16B, v2);
2547 
2548     __ BIND(L_doLast);
2549 
2550     __ aese(v0, v1);
2551     __ aesmc(v0, v0);
2552     __ aese(v0, v2);
2553 
2554     __ ld1(v1, __ T16B, key);
2555     __ rev32(v1, __ T16B, v1);
2556     __ eor(v0, __ T16B, v0, v1);
2557 
2558     __ st1(v0, __ T16B, to);
2559 
2560     __ mov(r0, 0);
2561 
2562     __ leave();
2563     __ ret(lr);
2564 
2565     return start;
2566   }
2567 
2568   // Arguments:
2569   //
2570   // Inputs:
2571   //   c_rarg0   - source byte array address
2572   //   c_rarg1   - destination byte array address
2573   //   c_rarg2   - K (key) in little endian int array
2574   //
2575   address generate_aescrypt_decryptBlock() {
2576     assert(UseAES, "need AES instructions");
2577     __ align(CodeEntryAlignment);
2578     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2579     Label L_doLast;
2580 
2581     const Register from        = c_rarg0;  // source array address
2582     const Register to          = c_rarg1;  // destination array address
2583     const Register key         = c_rarg2;  // key array address
2584     const Register keylen      = rscratch1;
2585 
2586     address start = __ pc();
2587     __ enter(); // required for proper stackwalking of RuntimeStub frame
2588 
2589     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2590 
2591     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2592 
2593     __ ld1(v5, __ T16B, __ post(key, 16));
2594     __ rev32(v5, __ T16B, v5);
2595 
2596     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2597     __ rev32(v1, __ T16B, v1);
2598     __ rev32(v2, __ T16B, v2);
2599     __ rev32(v3, __ T16B, v3);
2600     __ rev32(v4, __ T16B, v4);
2601     __ aesd(v0, v1);
2602     __ aesimc(v0, v0);
2603     __ aesd(v0, v2);
2604     __ aesimc(v0, v0);
2605     __ aesd(v0, v3);
2606     __ aesimc(v0, v0);
2607     __ aesd(v0, v4);
2608     __ aesimc(v0, v0);
2609 
2610     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2611     __ rev32(v1, __ T16B, v1);
2612     __ rev32(v2, __ T16B, v2);
2613     __ rev32(v3, __ T16B, v3);
2614     __ rev32(v4, __ T16B, v4);
2615     __ aesd(v0, v1);
2616     __ aesimc(v0, v0);
2617     __ aesd(v0, v2);
2618     __ aesimc(v0, v0);
2619     __ aesd(v0, v3);
2620     __ aesimc(v0, v0);
2621     __ aesd(v0, v4);
2622     __ aesimc(v0, v0);
2623 
2624     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2625     __ rev32(v1, __ T16B, v1);
2626     __ rev32(v2, __ T16B, v2);
2627 
2628     __ cmpw(keylen, 44);
2629     __ br(Assembler::EQ, L_doLast);
2630 
2631     __ aesd(v0, v1);
2632     __ aesimc(v0, v0);
2633     __ aesd(v0, v2);
2634     __ aesimc(v0, v0);
2635 
2636     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2637     __ rev32(v1, __ T16B, v1);
2638     __ rev32(v2, __ T16B, v2);
2639 
2640     __ cmpw(keylen, 52);
2641     __ br(Assembler::EQ, L_doLast);
2642 
2643     __ aesd(v0, v1);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v2);
2646     __ aesimc(v0, v0);
2647 
2648     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2649     __ rev32(v1, __ T16B, v1);
2650     __ rev32(v2, __ T16B, v2);
2651 
2652     __ BIND(L_doLast);
2653 
2654     __ aesd(v0, v1);
2655     __ aesimc(v0, v0);
2656     __ aesd(v0, v2);
2657 
2658     __ eor(v0, __ T16B, v0, v5);
2659 
2660     __ st1(v0, __ T16B, to);
2661 
2662     __ mov(r0, 0);
2663 
2664     __ leave();
2665     __ ret(lr);
2666 
2667     return start;
2668   }
2669 
2670   // Arguments:
2671   //
2672   // Inputs:
2673   //   c_rarg0   - source byte array address
2674   //   c_rarg1   - destination byte array address
2675   //   c_rarg2   - K (key) in little endian int array
2676   //   c_rarg3   - r vector byte array address
2677   //   c_rarg4   - input length
2678   //
2679   // Output:
2680   //   x0        - input length
2681   //
2682   address generate_cipherBlockChaining_encryptAESCrypt() {
2683     assert(UseAES, "need AES instructions");
2684     __ align(CodeEntryAlignment);
2685     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2686 
2687     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2688 
2689     const Register from        = c_rarg0;  // source array address
2690     const Register to          = c_rarg1;  // destination array address
2691     const Register key         = c_rarg2;  // key array address
2692     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2693                                            // and left with the results of the last encryption block
2694     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2695     const Register keylen      = rscratch1;
2696 
2697     address start = __ pc();
2698 
2699       __ enter();
2700 
2701       __ movw(rscratch2, len_reg);
2702 
2703       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2704 
2705       __ ld1(v0, __ T16B, rvec);
2706 
2707       __ cmpw(keylen, 52);
2708       __ br(Assembler::CC, L_loadkeys_44);
2709       __ br(Assembler::EQ, L_loadkeys_52);
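      // Round keys are loaded with a fall-through scheme: a 256-bit key loads
      // v17..v31, a 192-bit key enters at L_loadkeys_52 and loads v19..v31,
      // and a 128-bit key enters at L_loadkeys_44 and loads v21..v31.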
2710 
2711       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2712       __ rev32(v17, __ T16B, v17);
2713       __ rev32(v18, __ T16B, v18);
2714     __ BIND(L_loadkeys_52);
2715       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2716       __ rev32(v19, __ T16B, v19);
2717       __ rev32(v20, __ T16B, v20);
2718     __ BIND(L_loadkeys_44);
2719       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2720       __ rev32(v21, __ T16B, v21);
2721       __ rev32(v22, __ T16B, v22);
2722       __ rev32(v23, __ T16B, v23);
2723       __ rev32(v24, __ T16B, v24);
2724       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2725       __ rev32(v25, __ T16B, v25);
2726       __ rev32(v26, __ T16B, v26);
2727       __ rev32(v27, __ T16B, v27);
2728       __ rev32(v28, __ T16B, v28);
2729       __ ld1(v29, v30, v31, __ T16B, key);
2730       __ rev32(v29, __ T16B, v29);
2731       __ rev32(v30, __ T16B, v30);
2732       __ rev32(v31, __ T16B, v31);
2733 
2734     __ BIND(L_aes_loop);
2735       __ ld1(v1, __ T16B, __ post(from, 16));
2736       __ eor(v0, __ T16B, v0, v1);
2737 
2738       __ br(Assembler::CC, L_rounds_44);
2739       __ br(Assembler::EQ, L_rounds_52);
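      // The NZCV flags set by the cmpw(keylen, 52) before the loop are still
      // valid here (nothing in the loop body sets flags), so these branches
      // re-use that comparison to skip the rounds a shorter key doesn't need.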
2740 
2741       __ aese(v0, v17); __ aesmc(v0, v0);
2742       __ aese(v0, v18); __ aesmc(v0, v0);
2743     __ BIND(L_rounds_52);
2744       __ aese(v0, v19); __ aesmc(v0, v0);
2745       __ aese(v0, v20); __ aesmc(v0, v0);
2746     __ BIND(L_rounds_44);
2747       __ aese(v0, v21); __ aesmc(v0, v0);
2748       __ aese(v0, v22); __ aesmc(v0, v0);
2749       __ aese(v0, v23); __ aesmc(v0, v0);
2750       __ aese(v0, v24); __ aesmc(v0, v0);
2751       __ aese(v0, v25); __ aesmc(v0, v0);
2752       __ aese(v0, v26); __ aesmc(v0, v0);
2753       __ aese(v0, v27); __ aesmc(v0, v0);
2754       __ aese(v0, v28); __ aesmc(v0, v0);
2755       __ aese(v0, v29); __ aesmc(v0, v0);
2756       __ aese(v0, v30);
2757       __ eor(v0, __ T16B, v0, v31);
2758 
2759       __ st1(v0, __ T16B, __ post(to, 16));
2760 
2761       __ subw(len_reg, len_reg, 16);
2762       __ cbnzw(len_reg, L_aes_loop);
2763 
2764       __ st1(v0, __ T16B, rvec);
2765 
2766       __ mov(r0, rscratch2);
2767 
2768       __ leave();
2769       __ ret(lr);
2770 
2771       return start;
2772   }
2773 
2774   // Arguments:
2775   //
2776   // Inputs:
2777   //   c_rarg0   - source byte array address
2778   //   c_rarg1   - destination byte array address
2779   //   c_rarg2   - K (key) in little endian int array
2780   //   c_rarg3   - r vector byte array address
2781   //   c_rarg4   - input length
2782   //
2783   // Output:
2784   //   r0        - input length
2785   //
2786   address generate_cipherBlockChaining_decryptAESCrypt() {
2787     assert(UseAES, "need AES cryptographic extension support");
2788     __ align(CodeEntryAlignment);
2789     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2790 
2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2792 
2793     const Register from        = c_rarg0;  // source array address
2794     const Register to          = c_rarg1;  // destination array address
2795     const Register key         = c_rarg2;  // key array address
2796     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2797                                            // and left with the results of the last encryption block
2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2799     const Register keylen      = rscratch1;
2800 
2801     address start = __ pc();
2802 
2803       __ enter();
2804 
2805       __ movw(rscratch2, len_reg);
2806 
2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2808 
2809       __ ld1(v2, __ T16B, rvec);
2810 
2811       __ ld1(v31, __ T16B, __ post(key, 16));
2812       __ rev32(v31, __ T16B, v31);
2813 
2814       __ cmpw(keylen, 52);
2815       __ br(Assembler::CC, L_loadkeys_44);
2816       __ br(Assembler::EQ, L_loadkeys_52);
2817 
2818       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2819       __ rev32(v17, __ T16B, v17);
2820       __ rev32(v18, __ T16B, v18);
2821     __ BIND(L_loadkeys_52);
2822       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2823       __ rev32(v19, __ T16B, v19);
2824       __ rev32(v20, __ T16B, v20);
2825     __ BIND(L_loadkeys_44);
2826       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2827       __ rev32(v21, __ T16B, v21);
2828       __ rev32(v22, __ T16B, v22);
2829       __ rev32(v23, __ T16B, v23);
2830       __ rev32(v24, __ T16B, v24);
2831       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2832       __ rev32(v25, __ T16B, v25);
2833       __ rev32(v26, __ T16B, v26);
2834       __ rev32(v27, __ T16B, v27);
2835       __ rev32(v28, __ T16B, v28);
2836       __ ld1(v29, v30, __ T16B, key);
2837       __ rev32(v29, __ T16B, v29);
2838       __ rev32(v30, __ T16B, v30);
2839 
2840     __ BIND(L_aes_loop);
2841       __ ld1(v0, __ T16B, __ post(from, 16));
2842       __ orr(v1, __ T16B, v0, v0);
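           // Keep a copy of the ciphertext block in v1; after decryption it becomes
           // the chaining value (v2) for the next block.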
2843 
2844       __ br(Assembler::CC, L_rounds_44);
2845       __ br(Assembler::EQ, L_rounds_52);
2846 
2847       __ aesd(v0, v17); __ aesimc(v0, v0);
2848       __ aesd(v0, v18); __ aesimc(v0, v0);
2849     __ BIND(L_rounds_52);
2850       __ aesd(v0, v19); __ aesimc(v0, v0);
2851       __ aesd(v0, v20); __ aesimc(v0, v0);
2852     __ BIND(L_rounds_44);
2853       __ aesd(v0, v21); __ aesimc(v0, v0);
2854       __ aesd(v0, v22); __ aesimc(v0, v0);
2855       __ aesd(v0, v23); __ aesimc(v0, v0);
2856       __ aesd(v0, v24); __ aesimc(v0, v0);
2857       __ aesd(v0, v25); __ aesimc(v0, v0);
2858       __ aesd(v0, v26); __ aesimc(v0, v0);
2859       __ aesd(v0, v27); __ aesimc(v0, v0);
2860       __ aesd(v0, v28); __ aesimc(v0, v0);
2861       __ aesd(v0, v29); __ aesimc(v0, v0);
2862       __ aesd(v0, v30);
2863       __ eor(v0, __ T16B, v0, v31);
2864       __ eor(v0, __ T16B, v0, v2);
2865 
2866       __ st1(v0, __ T16B, __ post(to, 16));
2867       __ orr(v2, __ T16B, v1, v1);
2868 
2869       __ subw(len_reg, len_reg, 16);
2870       __ cbnzw(len_reg, L_aes_loop);
2871 
2872       __ st1(v2, __ T16B, rvec);
2873 
2874       __ mov(r0, rscratch2);
2875 
2876       __ leave();
2877       __ ret(lr);
2878 
2879     return start;
2880   }
2881 
2882   // Arguments:
2883   //
2884   // Inputs:
2885   //   c_rarg0   - byte[]  source+offset
2886   //   c_rarg1   - int[]   SHA.state
2887   //   c_rarg2   - int     offset
2888   //   c_rarg3   - int     limit
2889   //
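       // When multi_block is true the stub processes successive 64-byte blocks from
       // buf until ofs reaches limit and returns the updated offset in c_rarg0;
       // otherwise it compresses a single block.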
2890   address generate_sha1_implCompress(bool multi_block, const char *name) {
2891     __ align(CodeEntryAlignment);
2892     StubCodeMark mark(this, "StubRoutines", name);
2893     address start = __ pc();
2894 
2895     Register buf   = c_rarg0;
2896     Register state = c_rarg1;
2897     Register ofs   = c_rarg2;
2898     Register limit = c_rarg3;
2899 
2900     Label keys;
2901     Label sha1_loop;
2902 
2903     // load the keys into v0..v3
2904     __ adr(rscratch1, keys);
2905     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2906     // load 5 words state into v6, v7
2907     __ ldrq(v6, Address(state, 0));
2908     __ ldrs(v7, Address(state, 16));
2909 
2910 
2911     __ BIND(sha1_loop);
2912     // load 64 bytes of data into v16..v19
2913     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2914     __ rev32(v16, __ T16B, v16);
2915     __ rev32(v17, __ T16B, v17);
2916     __ rev32(v18, __ T16B, v18);
2917     __ rev32(v19, __ T16B, v19);
2918 
2919     // do the sha1
2920     __ addv(v4, __ T4S, v16, v0);
2921     __ orr(v20, __ T16B, v6, v6);
2922 
2923     FloatRegister d0 = v16;
2924     FloatRegister d1 = v17;
2925     FloatRegister d2 = v18;
2926     FloatRegister d3 = v19;
2927 
2928     for (int round = 0; round < 20; round++) {
2929       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2930       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2931       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2932       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2933       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2934 
2935       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2936       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2937       __ sha1h(tmp2, __ T4S, v20);
2938       if (round < 5)
2939         __ sha1c(v20, __ T4S, tmp3, tmp4);
2940       else if (round < 10 || round >= 15)
2941         __ sha1p(v20, __ T4S, tmp3, tmp4);
2942       else
2943         __ sha1m(v20, __ T4S, tmp3, tmp4);
2944       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2945 
2946       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2947     }
2948 
2949     __ addv(v7, __ T2S, v7, v21);
2950     __ addv(v6, __ T4S, v6, v20);
2951 
2952     if (multi_block) {
2953       __ add(ofs, ofs, 64);
2954       __ cmp(ofs, limit);
2955       __ br(Assembler::LE, sha1_loop);
2956       __ mov(c_rarg0, ofs); // return ofs
2957     }
2958 
2959     __ strq(v6, Address(state, 0));
2960     __ strs(v7, Address(state, 16));
2961 
2962     __ ret(lr);
2963 
2964     __ bind(keys);
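         // The four words below are the SHA-1 round constants (FIPS 180-4), one per
         // group of 20 rounds; the ld4r at the top of the stub replicates each of them
         // across all lanes of v0..v3.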
2965     __ emit_int32(0x5a827999);
2966     __ emit_int32(0x6ed9eba1);
2967     __ emit_int32(0x8f1bbcdc);
2968     __ emit_int32(0xca62c1d6);
2969 
2970     return start;
2971   }
2972 
2973 
2974   // Arguments:
2975   //
2976   // Inputs:
2977   //   c_rarg0   - byte[]  source+offset
2978   //   c_rarg1   - int[]   SHA.state
2979   //   c_rarg2   - int     offset
2980   //   c_rarg3   - int     limit
2981   //
2982   address generate_sha256_implCompress(bool multi_block, const char *name) {
2983     static const uint32_t round_consts[64] = {
2984       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2985       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2986       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2987       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2988       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2989       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2990       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2991       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2992       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2993       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2994       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2995       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2996       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2997       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2998       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2999       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3000     };
3001     __ align(CodeEntryAlignment);
3002     StubCodeMark mark(this, "StubRoutines", name);
3003     address start = __ pc();
3004 
3005     Register buf   = c_rarg0;
3006     Register state = c_rarg1;
3007     Register ofs   = c_rarg2;
3008     Register limit = c_rarg3;
3009 
3010     Label sha1_loop;
3011 
3012     __ stpd(v8, v9, __ pre(sp, -32));
3013     __ stpd(v10, v11, Address(sp, 16));
3014 
3015 // dga == v0
3016 // dgb == v1
3017 // dg0 == v2
3018 // dg1 == v3
3019 // dg2 == v4
3020 // t0 == v6
3021 // t1 == v7
3022 
3023     // load 16 keys to v16..v31
3024     __ lea(rscratch1, ExternalAddress((address)round_consts));
3025     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3026     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3027     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3028     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3029 
3030     // load 8 words (256 bits) state
3031     __ ldpq(v0, v1, state);
3032 
3033     __ BIND(sha1_loop);
3034     // load 64 bytes of data into v8..v11
3035     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3036     __ rev32(v8, __ T16B, v8);
3037     __ rev32(v9, __ T16B, v9);
3038     __ rev32(v10, __ T16B, v10);
3039     __ rev32(v11, __ T16B, v11);
3040 
3041     __ addv(v6, __ T4S, v8, v16);
3042     __ orr(v2, __ T16B, v0, v0);
3043     __ orr(v3, __ T16B, v1, v1);
3044 
3045     FloatRegister d0 = v8;
3046     FloatRegister d1 = v9;
3047     FloatRegister d2 = v10;
3048     FloatRegister d3 = v11;
3049 
3050 
3051     for (int round = 0; round < 16; round++) {
3052       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3053       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3054       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3055       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3056 
3057       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3058        __ orr(v4, __ T16B, v2, v2);
3059       if (round < 15)
3060         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3061       __ sha256h(v2, __ T4S, v3, tmp2);
3062       __ sha256h2(v3, __ T4S, v4, tmp2);
3063       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3064 
3065       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3066     }
3067 
3068     __ addv(v0, __ T4S, v0, v2);
3069     __ addv(v1, __ T4S, v1, v3);
3070 
3071     if (multi_block) {
3072       __ add(ofs, ofs, 64);
3073       __ cmp(ofs, limit);
3074       __ br(Assembler::LE, sha1_loop);
3075       __ mov(c_rarg0, ofs); // return ofs
3076     }
3077 
3078     __ ldpd(v10, v11, Address(sp, 16));
3079     __ ldpd(v8, v9, __ post(sp, 32));
3080 
3081     __ stpq(v0, v1, state);
3082 
3083     __ ret(lr);
3084 
3085     return start;
3086   }
3087 
3088   // Safefetch stubs.
3089   void generate_safefetch(const char* name, int size, address* entry,
3090                           address* fault_pc, address* continuation_pc) {
3091     // safefetch signatures:
3092     //   int      SafeFetch32(int*      adr, int      errValue);
3093     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3094     //
3095     // arguments:
3096     //   c_rarg0 = adr
3097     //   c_rarg1 = errValue
3098     //
3099     // result:
3100     //   r0       = *adr or errValue
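         //
         // Note: the load below may fault. The recorded fault_pc/continuation_pc pair
         // is expected to let the signal handler resume execution at *continuation_pc,
         // where c_rarg1 still holds errValue, so the caller simply sees errValue returned.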
3101 
3102     StubCodeMark mark(this, "StubRoutines", name);
3103 
3104     // Entry point, pc or function descriptor.
3105     *entry = __ pc();
3106 
3107     // Load *adr into c_rarg1, may fault.
3108     *fault_pc = __ pc();
3109     switch (size) {
3110       case 4:
3111         // int32_t
3112         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3113         break;
3114       case 8:
3115         // int64_t
3116         __ ldr(c_rarg1, Address(c_rarg0, 0));
3117         break;
3118       default:
3119         ShouldNotReachHere();
3120     }
3121 
3122     // return errValue or *adr
3123     *continuation_pc = __ pc();
3124     __ mov(r0, c_rarg1);
3125     __ ret(lr);
3126   }
3127 
3128   /**
3129    *  Arguments:
3130    *
3131    * Inputs:
3132    *   c_rarg0   - int crc
3133    *   c_rarg1   - byte* buf
3134    *   c_rarg2   - int length
3135    *
3136    * Output:
3137    *       r0    - int crc result
3138    */
3139   address generate_updateBytesCRC32() {
3140     assert(UseCRC32Intrinsics, "what are we doing here?");
3141 
3142     __ align(CodeEntryAlignment);
3143     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3144 
3145     address start = __ pc();
3146 
3147     const Register crc   = c_rarg0;  // crc
3148     const Register buf   = c_rarg1;  // source java byte array address
3149     const Register len   = c_rarg2;  // length
3150     const Register table0 = c_rarg3; // crc_table address
3151     const Register table1 = c_rarg4;
3152     const Register table2 = c_rarg5;
3153     const Register table3 = c_rarg6;
3154     const Register tmp3 = c_rarg7;
3155 
3156     BLOCK_COMMENT("Entry:");
3157     __ enter(); // required for proper stackwalking of RuntimeStub frame
3158 
3159     __ kernel_crc32(crc, buf, len,
3160               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3161 
3162     __ leave(); // required for proper stackwalking of RuntimeStub frame
3163     __ ret(lr);
3164 
3165     return start;
3166   }
3167 
3168   /**
3169    *  Arguments:
3170    *
3171    * Inputs:
3172    *   c_rarg0   - int crc
3173    *   c_rarg1   - byte* buf
3174    *   c_rarg2   - int length
3175    *   c_rarg3   - int* table
3176    *
3177    * Output:
3178    *       r0   - int crc result
3179    */
3180   address generate_updateBytesCRC32C() {
3181     assert(UseCRC32CIntrinsics, "what are we doing here?");
3182 
3183     __ align(CodeEntryAlignment);
3184     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3185 
3186     address start = __ pc();
3187 
3188     const Register crc   = c_rarg0;  // crc
3189     const Register buf   = c_rarg1;  // source java byte array address
3190     const Register len   = c_rarg2;  // length
3191     const Register table0 = c_rarg3; // crc_table address
3192     const Register table1 = c_rarg4;
3193     const Register table2 = c_rarg5;
3194     const Register table3 = c_rarg6;
3195     const Register tmp3 = c_rarg7;
3196 
3197     BLOCK_COMMENT("Entry:");
3198     __ enter(); // required for proper stackwalking of RuntimeStub frame
3199 
3200     __ kernel_crc32c(crc, buf, len,
3201               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3202 
3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
3204     __ ret(lr);
3205 
3206     return start;
3207   }
3208 
3209   /***
3210    *  Arguments:
3211    *
3212    *  Inputs:
3213    *   c_rarg0   - int   adler
3214    *   c_rarg1   - byte* buff
3215    *   c_rarg2   - int   len
3216    *
3217    * Output:
3218    *   c_rarg0   - int adler result
3219    */
3220   address generate_updateBytesAdler32() {
3221     __ align(CodeEntryAlignment);
3222     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3223     address start = __ pc();
3224 
3225     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3226 
3227     // Aliases
3228     Register adler  = c_rarg0;
3229     Register s1     = c_rarg0;
3230     Register s2     = c_rarg3;
3231     Register buff   = c_rarg1;
3232     Register len    = c_rarg2;
3233     Register nmax  = r4;
3234     Register base  = r5;
3235     Register count = r6;
3236     Register temp0 = rscratch1;
3237     Register temp1 = rscratch2;
3238     FloatRegister vbytes = v0;
3239     FloatRegister vs1acc = v1;
3240     FloatRegister vs2acc = v2;
3241     FloatRegister vtable = v3;
3242 
3243     // Max number of bytes we can process before having to take the mod
3244     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3245     uint64_t BASE = 0xfff1;
3246     uint64_t NMAX = 0x15B0;
3247 
3248     __ mov(base, BASE);
3249     __ mov(nmax, NMAX);
3250 
3251     // Load accumulation coefficients for the upper 16 bits
3252     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3253     __ ld1(vtable, __ T16B, Address(temp0));
3254 
3255     // s1 is initialized to the lower 16 bits of adler
3256     // s2 is initialized to the upper 16 bits of adler
3257     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3258     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3259 
3260     // The pipelined loop needs at least 16 elements for 1 iteration
3261     // It does check this, but it is more effective to skip to the cleanup loop
3262     __ cmp(len, 16);
3263     __ br(Assembler::HS, L_nmax);
3264     __ cbz(len, L_combine);
3265 
3266     __ bind(L_simple_by1_loop);
3267     __ ldrb(temp0, Address(__ post(buff, 1)));
3268     __ add(s1, s1, temp0);
3269     __ add(s2, s2, s1);
3270     __ subs(len, len, 1);
3271     __ br(Assembler::HI, L_simple_by1_loop);
3272 
3273     // s1 = s1 % BASE
3274     __ subs(temp0, s1, base);
3275     __ csel(s1, temp0, s1, Assembler::HS);
3276 
3277     // s2 = s2 % BASE
3278     __ lsr(temp0, s2, 16);
3279     __ lsl(temp1, temp0, 4);
3280     __ sub(temp1, temp1, temp0);
3281     __ add(s2, temp1, s2, ext::uxth);
3282 
3283     __ subs(temp0, s2, base);
3284     __ csel(s2, temp0, s2, Assembler::HS);
3285 
3286     __ b(L_combine);
3287 
3288     __ bind(L_nmax);
3289     __ subs(len, len, nmax);
3290     __ sub(count, nmax, 16);
3291     __ br(Assembler::LO, L_by16);
3292 
3293     __ bind(L_nmax_loop);
3294 
3295     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3296                                       vbytes, vs1acc, vs2acc, vtable);
3297 
3298     __ subs(count, count, 16);
3299     __ br(Assembler::HS, L_nmax_loop);
3300 
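         // Modular reduction without division: since 2^16 = BASE + 15 (BASE = 65521),
         // x mod BASE can be computed as (x & 0xffff) + 15 * (x >> 16), applied twice
         // and followed by one conditional subtract of BASE. The same pattern is used
         // for every "% BASE" block below.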
3301     // s1 = s1 % BASE
3302     __ lsr(temp0, s1, 16);
3303     __ lsl(temp1, temp0, 4);
3304     __ sub(temp1, temp1, temp0);
3305     __ add(temp1, temp1, s1, ext::uxth);
3306 
3307     __ lsr(temp0, temp1, 16);
3308     __ lsl(s1, temp0, 4);
3309     __ sub(s1, s1, temp0);
3310     __ add(s1, s1, temp1, ext:: uxth);
3311 
3312     __ subs(temp0, s1, base);
3313     __ csel(s1, temp0, s1, Assembler::HS);
3314 
3315     // s2 = s2 % BASE
3316     __ lsr(temp0, s2, 16);
3317     __ lsl(temp1, temp0, 4);
3318     __ sub(temp1, temp1, temp0);
3319     __ add(temp1, temp1, s2, ext::uxth);
3320 
3321     __ lsr(temp0, temp1, 16);
3322     __ lsl(s2, temp0, 4);
3323     __ sub(s2, s2, temp0);
3324     __ add(s2, s2, temp1, ext:: uxth);
3325 
3326     __ subs(temp0, s2, base);
3327     __ csel(s2, temp0, s2, Assembler::HS);
3328 
3329     __ subs(len, len, nmax);
3330     __ sub(count, nmax, 16);
3331     __ br(Assembler::HS, L_nmax_loop);
3332 
3333     __ bind(L_by16);
3334     __ adds(len, len, count);
3335     __ br(Assembler::LO, L_by1);
3336 
3337     __ bind(L_by16_loop);
3338 
3339     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3340                                       vbytes, vs1acc, vs2acc, vtable);
3341 
3342     __ subs(len, len, 16);
3343     __ br(Assembler::HS, L_by16_loop);
3344 
3345     __ bind(L_by1);
3346     __ adds(len, len, 15);
3347     __ br(Assembler::LO, L_do_mod);
3348 
3349     __ bind(L_by1_loop);
3350     __ ldrb(temp0, Address(__ post(buff, 1)));
3351     __ add(s1, temp0, s1);
3352     __ add(s2, s2, s1);
3353     __ subs(len, len, 1);
3354     __ br(Assembler::HS, L_by1_loop);
3355 
3356     __ bind(L_do_mod);
3357     // s1 = s1 % BASE
3358     __ lsr(temp0, s1, 16);
3359     __ lsl(temp1, temp0, 4);
3360     __ sub(temp1, temp1, temp0);
3361     __ add(temp1, temp1, s1, ext::uxth);
3362 
3363     __ lsr(temp0, temp1, 16);
3364     __ lsl(s1, temp0, 4);
3365     __ sub(s1, s1, temp0);
3366     __ add(s1, s1, temp1, ext:: uxth);
3367 
3368     __ subs(temp0, s1, base);
3369     __ csel(s1, temp0, s1, Assembler::HS);
3370 
3371     // s2 = s2 % BASE
3372     __ lsr(temp0, s2, 16);
3373     __ lsl(temp1, temp0, 4);
3374     __ sub(temp1, temp1, temp0);
3375     __ add(temp1, temp1, s2, ext::uxth);
3376 
3377     __ lsr(temp0, temp1, 16);
3378     __ lsl(s2, temp0, 4);
3379     __ sub(s2, s2, temp0);
3380     __ add(s2, s2, temp1, ext:: uxth);
3381 
3382     __ subs(temp0, s2, base);
3383     __ csel(s2, temp0, s2, Assembler::HS);
3384 
3385     // Combine lower bits and higher bits
3386     __ bind(L_combine);
3387     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3388 
3389     __ ret(lr);
3390 
3391     return start;
3392   }
3393 
3394   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3395           Register temp0, Register temp1, FloatRegister vbytes,
3396           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3397     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3398     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3399     // In non-vectorized code, we update s1 and s2 as:
3400     //   s1 <- s1 + b1
3401     //   s2 <- s2 + s1
3402     //   s1 <- s1 + b2
3403     //   s2 <- s2 + s1
3404     //   ...
3405     //   s1 <- s1 + b16
3406     //   s2 <- s2 + s1
3407     // Putting above assignments together, we have:
3408     //   s1_new = s1 + b1 + b2 + ... + b16
3409     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3410     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3411     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
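         // (vtable is assumed to hold the weight vector (16, 15, ..., 1) so that the
         // umull/umlal sequence below computes exactly that dot product.)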
3412     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3413 
3414     // s2 = s2 + s1 * 16
3415     __ add(s2, s2, s1, Assembler::LSL, 4);
3416 
3417     // vs1acc = b1 + b2 + b3 + ... + b16
3418     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3419     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3420     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3421     __ uaddlv(vs1acc, __ T16B, vbytes);
3422     __ uaddlv(vs2acc, __ T8H, vs2acc);
3423 
3424     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3425     __ fmovd(temp0, vs1acc);
3426     __ fmovd(temp1, vs2acc);
3427     __ add(s1, s1, temp0);
3428     __ add(s2, s2, temp1);
3429   }
3430 
3431   /**
3432    *  Arguments:
3433    *
3434    *  Input:
3435    *    c_rarg0   - x address
3436    *    c_rarg1   - x length
3437    *    c_rarg2   - y address
3438    *    c_rarg3   - y length
3439    *    c_rarg4   - z address
3440    *    c_rarg5   - z length
3441    */
3442   address generate_multiplyToLen() {
3443     __ align(CodeEntryAlignment);
3444     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3445 
3446     address start = __ pc();
3447     const Register x     = r0;
3448     const Register xlen  = r1;
3449     const Register y     = r2;
3450     const Register ylen  = r3;
3451     const Register z     = r4;
3452     const Register zlen  = r5;
3453 
3454     const Register tmp1  = r10;
3455     const Register tmp2  = r11;
3456     const Register tmp3  = r12;
3457     const Register tmp4  = r13;
3458     const Register tmp5  = r14;
3459     const Register tmp6  = r15;
3460     const Register tmp7  = r16;
3461 
3462     BLOCK_COMMENT("Entry:");
3463     __ enter(); // required for proper stackwalking of RuntimeStub frame
3464     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3465     __ leave(); // required for proper stackwalking of RuntimeStub frame
3466     __ ret(lr);
3467 
3468     return start;
3469   }
3470 
3471   address generate_squareToLen() {
3472     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3473     // faster than multiply_to_len on some CPUs and slower on others, but
3474     // multiply_to_len shows slightly better results overall.
3475     __ align(CodeEntryAlignment);
3476     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3477     address start = __ pc();
3478 
3479     const Register x     = r0;
3480     const Register xlen  = r1;
3481     const Register z     = r2;
3482     const Register zlen  = r3;
3483     const Register y     = r4; // == x
3484     const Register ylen  = r5; // == xlen
3485 
3486     const Register tmp1  = r10;
3487     const Register tmp2  = r11;
3488     const Register tmp3  = r12;
3489     const Register tmp4  = r13;
3490     const Register tmp5  = r14;
3491     const Register tmp6  = r15;
3492     const Register tmp7  = r16;
3493 
3494     RegSet spilled_regs = RegSet::of(y, ylen);
3495     BLOCK_COMMENT("Entry:");
3496     __ enter();
3497     __ push(spilled_regs, sp);
3498     __ mov(y, x);
3499     __ mov(ylen, xlen);
3500     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3501     __ pop(spilled_regs, sp);
3502     __ leave();
3503     __ ret(lr);
3504     return start;
3505   }
3506 
3507   address generate_mulAdd() {
3508     __ align(CodeEntryAlignment);
3509     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3510 
3511     address start = __ pc();
3512 
3513     const Register out     = r0;
3514     const Register in      = r1;
3515     const Register offset  = r2;
3516     const Register len     = r3;
3517     const Register k       = r4;
3518 
3519     BLOCK_COMMENT("Entry:");
3520     __ enter();
3521     __ mul_add(out, in, offset, len, k);
3522     __ leave();
3523     __ ret(lr);
3524 
3525     return start;
3526   }
3527 
3528   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3529                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3530                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3531     // Karatsuba multiplication performs a 128*128 -> 256-bit
3532     // multiplication in three 128-bit multiplications and a few
3533     // additions.
3534     //
3535     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3536     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3537     //
3538     // Inputs:
3539     //
3540     // A0 in a.d[0]     (subkey)
3541     // A1 in a.d[1]
3542     // (A1+A0) in a1_xor_a0.d[0]
3543     //
3544     // B0 in b.d[0]     (state)
3545     // B1 in b.d[1]
3546 
3547     __ ext(tmp1, __ T16B, b, b, 0x08);
3548     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3549     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3550     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3551     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3552 
3553     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3554     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3555     __ eor(tmp2, __ T16B, tmp2, tmp4);
3556     __ eor(tmp2, __ T16B, tmp2, tmp3);
3557 
3558     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3559     __ ins(result_hi, __ D, tmp2, 0, 1);
3560     __ ins(result_lo, __ D, tmp2, 1, 0);
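         // The two ins instructions above fold the middle 128-bit term (tmp2) into the
         // 256-bit product: its low half lands in the upper 64 bits of result_lo and
         // its high half in the lower 64 bits of result_hi.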
3561   }
3562 
3563   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3564                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3565     const FloatRegister t0 = result;
3566 
3567     // The GCM field polynomial f is z^128 + p(z), where p =
3568     // z^7+z^2+z+1.
3569     //
3570     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3571     //
3572     // so, given that the product we're reducing is
3573     //    a == lo + hi * z^128
3574     // substituting,
3575     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3576     //
3577     // we reduce by multiplying hi by p(z) and subtracting the result
3578     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3579     // bits we can do this with two 64-bit multiplications, lo*p and
3580     // hi*p.
3581 
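         // The multiplication of hi by p is split into two 64-bit halves: pmull2 handles
         // the upper half of hi (its 128-bit product is folded back into hi and lo via
         // the two ext/eor pairs), then the final pmull handles the lower half.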
3582     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3583     __ ext(t1, __ T16B, t0, z, 8);
3584     __ eor(hi, __ T16B, hi, t1);
3585     __ ext(t1, __ T16B, z, t0, 8);
3586     __ eor(lo, __ T16B, lo, t1);
3587     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3588     __ eor(result, __ T16B, lo, t0);
3589   }
3590 
3591   address generate_has_negatives(address &has_negatives_long) {
3592     const int large_loop_size = 64;
3593     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
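         // A byte is "negative" iff its top bit is set, so UPPER_BIT_MASK (bit 7 of every
         // byte) lets 8 bytes be tested at once: words are OR-ed together, then tst-ed
         // against the mask.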
3594     int dcache_line = VM_Version::dcache_line_size();
3595 
3596     Register ary1 = r1, len = r2, result = r0;
3597 
3598     __ align(CodeEntryAlignment);
3599 
3600     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3601 
3602     address entry = __ pc();
3603 
3604     __ enter();
3605 
3606   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3607         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3608 
3609   __ cmp(len, 15);
3610   __ br(Assembler::GT, LEN_OVER_15);
3611   // The only case in which execution falls into this code is when the pointer is
3612   // near the end of a memory page and we have to avoid reading the next page.
3613   __ add(ary1, ary1, len);
3614   __ subs(len, len, 8);
3615   __ br(Assembler::GT, LEN_OVER_8);
3616   __ ldr(rscratch2, Address(ary1, -8));
3617   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3618   __ lsrv(rscratch2, rscratch2, rscratch1);
3619   __ tst(rscratch2, UPPER_BIT_MASK);
3620   __ cset(result, Assembler::NE);
3621   __ leave();
3622   __ ret(lr);
3623   __ bind(LEN_OVER_8);
3624   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3625   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3626   __ tst(rscratch2, UPPER_BIT_MASK);
3627   __ br(Assembler::NE, RET_TRUE_NO_POP);
3628   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3629   __ lsrv(rscratch1, rscratch1, rscratch2);
3630   __ tst(rscratch1, UPPER_BIT_MASK);
3631   __ cset(result, Assembler::NE);
3632   __ leave();
3633   __ ret(lr);
3634 
3635   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3636   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3637 
3638   has_negatives_long = __ pc(); // 2nd entry point
3639 
3640   __ enter();
3641 
3642   __ bind(LEN_OVER_15);
3643     __ push(spilled_regs, sp);
3644     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3645     __ cbz(rscratch2, ALIGNED);
3646     __ ldp(tmp6, tmp1, Address(ary1));
3647     __ mov(tmp5, 16);
3648     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3649     __ add(ary1, ary1, rscratch1);
3650     __ sub(len, len, rscratch1);
3651     __ orr(tmp6, tmp6, tmp1);
3652     __ tst(tmp6, UPPER_BIT_MASK);
3653     __ br(Assembler::NE, RET_TRUE);
3654 
3655   __ bind(ALIGNED);
3656     __ cmp(len, large_loop_size);
3657     __ br(Assembler::LT, CHECK_16);
3658     // Perform a 16-byte load as an early return in the pre-loop to handle the
3659     // situation where an initially aligned large array has negative values in its
3660     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
3661     // the worst case), which is slower. Cases with negative bytes further ahead
3662     // are not affected much; in fact they are faster due to the early loads and
3663     // the fewer instructions and branches in LARGE_LOOP.
3664     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3665     __ sub(len, len, 16);
3666     __ orr(tmp6, tmp6, tmp1);
3667     __ tst(tmp6, UPPER_BIT_MASK);
3668     __ br(Assembler::NE, RET_TRUE);
3669     __ cmp(len, large_loop_size);
3670     __ br(Assembler::LT, CHECK_16);
3671 
3672     if (SoftwarePrefetchHintDistance >= 0
3673         && SoftwarePrefetchHintDistance >= dcache_line) {
3674       // initial prefetch
3675       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3676     }
3677   __ bind(LARGE_LOOP);
3678     if (SoftwarePrefetchHintDistance >= 0) {
3679       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3680     }
3681     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3682     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
3683     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
3684     // 3 instructions per iteration and has fewer branches. The downside is that this
3685     // approach disables early return, so all 64 bytes are loaded and checked every time.
3686     __ ldp(tmp2, tmp3, Address(ary1));
3687     __ ldp(tmp4, tmp5, Address(ary1, 16));
3688     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3689     __ ldp(tmp6, tmp1, Address(ary1, 48));
3690     __ add(ary1, ary1, large_loop_size);
3691     __ sub(len, len, large_loop_size);
3692     __ orr(tmp2, tmp2, tmp3);
3693     __ orr(tmp4, tmp4, tmp5);
3694     __ orr(rscratch1, rscratch1, rscratch2);
3695     __ orr(tmp6, tmp6, tmp1);
3696     __ orr(tmp2, tmp2, tmp4);
3697     __ orr(rscratch1, rscratch1, tmp6);
3698     __ orr(tmp2, tmp2, rscratch1);
3699     __ tst(tmp2, UPPER_BIT_MASK);
3700     __ br(Assembler::NE, RET_TRUE);
3701     __ cmp(len, large_loop_size);
3702     __ br(Assembler::GE, LARGE_LOOP);
3703 
3704   __ bind(CHECK_16); // small 16-byte load pre-loop
3705     __ cmp(len, 16);
3706     __ br(Assembler::LT, POST_LOOP16);
3707 
3708   __ bind(LOOP16); // small 16-byte load loop
3709     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3710     __ sub(len, len, 16);
3711     __ orr(tmp2, tmp2, tmp3);
3712     __ tst(tmp2, UPPER_BIT_MASK);
3713     __ br(Assembler::NE, RET_TRUE);
3714     __ cmp(len, 16);
3715     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3716 
3717   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3718     __ cmp(len, 8);
3719     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3720     __ ldr(tmp3, Address(__ post(ary1, 8)));
3721     __ sub(len, len, 8);
3722     __ tst(tmp3, UPPER_BIT_MASK);
3723     __ br(Assembler::NE, RET_TRUE);
3724 
3725   __ bind(POST_LOOP16_LOAD_TAIL);
3726     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3727     __ ldr(tmp1, Address(ary1));
3728     __ mov(tmp2, 64);
3729     __ sub(tmp4, tmp2, len, __ LSL, 3);
3730     __ lslv(tmp1, tmp1, tmp4);
3731     __ tst(tmp1, UPPER_BIT_MASK);
3732     __ br(Assembler::NE, RET_TRUE);
3733     // Fallthrough
3734 
3735   __ bind(RET_FALSE);
3736     __ pop(spilled_regs, sp);
3737     __ leave();
3738     __ mov(result, zr);
3739     __ ret(lr);
3740 
3741   __ bind(RET_TRUE);
3742     __ pop(spilled_regs, sp);
3743   __ bind(RET_TRUE_NO_POP);
3744     __ leave();
3745     __ mov(result, 1);
3746     __ ret(lr);
3747 
3748   __ bind(DONE);
3749     __ pop(spilled_regs, sp);
3750     __ leave();
3751     __ ret(lr);
3752     return entry;
3753   }
3754 
3755   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3756         bool usePrefetch, Label &NOT_EQUAL) {
3757     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3758         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3759         tmp7 = r12, tmp8 = r13;
3760     Label LOOP;
3761 
3762     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3763     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3764     __ bind(LOOP);
3765     if (usePrefetch) {
3766       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3767       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3768     }
3769     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3770     __ eor(tmp1, tmp1, tmp2);
3771     __ eor(tmp3, tmp3, tmp4);
3772     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3773     __ orr(tmp1, tmp1, tmp3);
3774     __ cbnz(tmp1, NOT_EQUAL);
3775     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3776     __ eor(tmp5, tmp5, tmp6);
3777     __ eor(tmp7, tmp7, tmp8);
3778     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3779     __ orr(tmp5, tmp5, tmp7);
3780     __ cbnz(tmp5, NOT_EQUAL);
3781     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3782     __ eor(tmp1, tmp1, tmp2);
3783     __ eor(tmp3, tmp3, tmp4);
3784     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3785     __ orr(tmp1, tmp1, tmp3);
3786     __ cbnz(tmp1, NOT_EQUAL);
3787     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3788     __ eor(tmp5, tmp5, tmp6);
3789     __ sub(cnt1, cnt1, 8 * wordSize);
3790     __ eor(tmp7, tmp7, tmp8);
3791     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3792     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3793     // cmp) because subs allows an unlimited range of immediate operand.
3794     __ subs(tmp6, cnt1, loopThreshold);
3795     __ orr(tmp5, tmp5, tmp7);
3796     __ cbnz(tmp5, NOT_EQUAL);
3797     __ br(__ GE, LOOP);
3798     // post-loop
3799     __ eor(tmp1, tmp1, tmp2);
3800     __ eor(tmp3, tmp3, tmp4);
3801     __ orr(tmp1, tmp1, tmp3);
3802     __ sub(cnt1, cnt1, 2 * wordSize);
3803     __ cbnz(tmp1, NOT_EQUAL);
3804   }
3805 
3806   void generate_large_array_equals_loop_simd(int loopThreshold,
3807         bool usePrefetch, Label &NOT_EQUAL) {
3808     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3809         tmp2 = rscratch2;
3810     Label LOOP;
3811 
3812     __ bind(LOOP);
3813     if (usePrefetch) {
3814       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3815       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3816     }
3817     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3818     __ sub(cnt1, cnt1, 8 * wordSize);
3819     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3820     __ subs(tmp1, cnt1, loopThreshold);
3821     __ eor(v0, __ T16B, v0, v4);
3822     __ eor(v1, __ T16B, v1, v5);
3823     __ eor(v2, __ T16B, v2, v6);
3824     __ eor(v3, __ T16B, v3, v7);
3825     __ orr(v0, __ T16B, v0, v1);
3826     __ orr(v1, __ T16B, v2, v3);
3827     __ orr(v0, __ T16B, v0, v1);
3828     __ umov(tmp1, v0, __ D, 0);
3829     __ umov(tmp2, v0, __ D, 1);
3830     __ orr(tmp1, tmp1, tmp2);
3831     __ cbnz(tmp1, NOT_EQUAL);
3832     __ br(__ GE, LOOP);
3833   }
3834 
3835   // a1 = r1 - array1 address
3836   // a2 = r2 - array2 address
3837   // result = r0 - return value. Already contains "false"
3838   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3839   // r3-r5 are reserved temporary registers
3840   address generate_large_array_equals() {
3841     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3842         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3843         tmp7 = r12, tmp8 = r13;
3844     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3845         SMALL_LOOP, POST_LOOP;
3846     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3847     // calculate if at least 32 prefetched bytes are used
3848     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3849     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3850     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3851     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3852         tmp5, tmp6, tmp7, tmp8);
3853 
3854     __ align(CodeEntryAlignment);
3855 
3856     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3857 
3858     address entry = __ pc();
3859     __ enter();
3860     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3861     // also advance pointers to use post-increment instead of pre-increment
3862     __ add(a1, a1, wordSize);
3863     __ add(a2, a2, wordSize);
3864     if (AvoidUnalignedAccesses) {
3865       // Both implementations (SIMD/non-SIMD) use relatively large load
3866       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3867       // time) on some CPUs when the address is not at least 16-byte aligned.
3868       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
3869       // load if needed, at least for the 1st address, to make it 16-byte aligned.
3870       Label ALIGNED16;
3871       __ tbz(a1, 3, ALIGNED16);
3872       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3873       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3874       __ sub(cnt1, cnt1, wordSize);
3875       __ eor(tmp1, tmp1, tmp2);
3876       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3877       __ bind(ALIGNED16);
3878     }
3879     if (UseSIMDForArrayEquals) {
3880       if (SoftwarePrefetchHintDistance >= 0) {
3881         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3882         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3883         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3884             /* prfm = */ true, NOT_EQUAL);
3885         __ cmp(cnt1, nonPrefetchLoopThreshold);
3886         __ br(__ LT, TAIL);
3887       }
3888       __ bind(NO_PREFETCH_LARGE_LOOP);
3889       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3890           /* prfm = */ false, NOT_EQUAL);
3891     } else {
3892       __ push(spilled_regs, sp);
3893       if (SoftwarePrefetchHintDistance >= 0) {
3894         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3895         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3896         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3897             /* prfm = */ true, NOT_EQUAL);
3898         __ cmp(cnt1, nonPrefetchLoopThreshold);
3899         __ br(__ LT, TAIL);
3900       }
3901       __ bind(NO_PREFETCH_LARGE_LOOP);
3902       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3903           /* prfm = */ false, NOT_EQUAL);
3904     }
3905     __ bind(TAIL);
3906       __ cbz(cnt1, EQUAL);
3907       __ subs(cnt1, cnt1, wordSize);
3908       __ br(__ LE, POST_LOOP);
3909     __ bind(SMALL_LOOP);
3910       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3911       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3912       __ subs(cnt1, cnt1, wordSize);
3913       __ eor(tmp1, tmp1, tmp2);
3914       __ cbnz(tmp1, NOT_EQUAL);
3915       __ br(__ GT, SMALL_LOOP);
3916     __ bind(POST_LOOP);
3917       __ ldr(tmp1, Address(a1, cnt1));
3918       __ ldr(tmp2, Address(a2, cnt1));
3919       __ eor(tmp1, tmp1, tmp2);
3920       __ cbnz(tmp1, NOT_EQUAL);
3921     __ bind(EQUAL);
3922       __ mov(result, true);
3923     __ bind(NOT_EQUAL);
3924       if (!UseSIMDForArrayEquals) {
3925         __ pop(spilled_regs, sp);
3926       }
3927     __ bind(NOT_EQUAL_NO_POP);
3928     __ leave();
3929     __ ret(lr);
3930     return entry;
3931   }
3932 
3933   address generate_dsin_dcos(bool isCos) {
3934     __ align(CodeEntryAlignment);
3935     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3936     address start = __ pc();
3937     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3938         (address)StubRoutines::aarch64::_two_over_pi,
3939         (address)StubRoutines::aarch64::_pio2,
3940         (address)StubRoutines::aarch64::_dsin_coef,
3941         (address)StubRoutines::aarch64::_dcos_coef);
3942     return start;
3943   }
3944 
3945   address generate_dlog() {
3946     __ align(CodeEntryAlignment);
3947     StubCodeMark mark(this, "StubRoutines", "dlog");
3948     address entry = __ pc();
3949     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3950         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3951     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3952     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3953         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3954     return entry;
3955   }
3956 
3957   // code for comparing 16 bytes of strings with same encoding
3958   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3959     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3960     __ ldr(rscratch1, Address(__ post(str1, 8)));
3961     __ eor(rscratch2, tmp1, tmp2);
3962     __ ldr(cnt1, Address(__ post(str2, 8)));
3963     __ cbnz(rscratch2, DIFF1);
3964     __ ldr(tmp1, Address(__ post(str1, 8)));
3965     __ eor(rscratch2, rscratch1, cnt1);
3966     __ ldr(tmp2, Address(__ post(str2, 8)));
3967     __ cbnz(rscratch2, DIFF2);
3968   }
3969 
3970   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3971   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3972       Label &DIFF2) {
3973     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3974     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3975 
3976     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3977     __ ldr(tmpU, Address(__ post(cnt1, 8)));
3978     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
3979     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
3980 
3981     __ fmovd(tmpL, vtmp3);
3982     __ eor(rscratch2, tmp3, tmpL);
3983     __ cbnz(rscratch2, DIFF2);
3984 
3985     __ ldr(tmp3, Address(__ post(cnt1, 8)));
3986     __ umov(tmpL, vtmp3, __ D, 1);
3987     __ eor(rscratch2, tmpU, tmpL);
3988     __ cbnz(rscratch2, DIFF1);
3989 
3990     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
3991     __ ldr(tmpU, Address(__ post(cnt1, 8)));
3992     __ fmovd(tmpL, vtmp);
3993     __ eor(rscratch2, tmp3, tmpL);
3994     __ cbnz(rscratch2, DIFF2);
3995 
3996     __ ldr(tmp3, Address(__ post(cnt1, 8)));
3997     __ umov(tmpL, vtmp, __ D, 1);
3998     __ eor(rscratch2, tmpU, tmpL);
3999     __ cbnz(rscratch2, DIFF1);
4000   }
4001 
4002   // r0  = result
4003   // r1  = str1
4004   // r2  = cnt1
4005   // r3  = str2
4006   // r4  = cnt2
4007   // r10 = tmp1
4008   // r11 = tmp2
4009   address generate_compare_long_string_different_encoding(bool isLU) {
4010     __ align(CodeEntryAlignment);
4011     StubCodeMark mark(this, "StubRoutines", isLU
4012         ? "compare_long_string_different_encoding LU"
4013         : "compare_long_string_different_encoding UL");
4014     address entry = __ pc();
4015     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4016         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4017         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4018     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4019         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4020     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4021     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4022 
4023     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4024 
4025     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4026     // cnt2 == number of characters left to compare
4027     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4028     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4029     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4030     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4031     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4032     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4033     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4034     __ eor(rscratch2, tmp1, tmp2);
4035     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4036     __ mov(rscratch1, tmp2);
4037     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4038     Register strU = isLU ? str2 : str1,
4039              strL = isLU ? str1 : str2,
4040              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4041              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4042     __ push(spilled_regs, sp);
4043     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4044     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4045 
4046     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4047 
4048     if (SoftwarePrefetchHintDistance >= 0) {
4049       __ cmp(cnt2, prefetchLoopExitCondition);
4050       __ br(__ LT, NO_PREFETCH);
4051       __ bind(LARGE_LOOP_PREFETCH);
4052         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4053         __ mov(tmp4, 2);
4054         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4055         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4056           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4057           __ subs(tmp4, tmp4, 1);
4058           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4059           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4060           __ mov(tmp4, 2);
4061         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4062           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4063           __ subs(tmp4, tmp4, 1);
4064           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4065           __ sub(cnt2, cnt2, 64);
4066           __ cmp(cnt2, prefetchLoopExitCondition);
4067           __ br(__ GE, LARGE_LOOP_PREFETCH);
4068     }
4069     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4070     __ bind(NO_PREFETCH);
4071     __ subs(cnt2, cnt2, 16);
4072     __ br(__ LT, TAIL);
4073     __ bind(SMALL_LOOP); // smaller loop
4074       __ subs(cnt2, cnt2, 16);
4075       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4076       __ br(__ GE, SMALL_LOOP);
4077       __ cmn(cnt2, (u1)16);
4078       __ br(__ EQ, LOAD_LAST);
4079     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4080       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4081       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4082       __ ldr(tmp3, Address(cnt1, -8));
4083       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4084       __ b(LOAD_LAST);
4085     __ bind(DIFF2);
4086       __ mov(tmpU, tmp3);
4087     __ bind(DIFF1);
4088       __ pop(spilled_regs, sp);
4089       __ b(CALCULATE_DIFFERENCE);
4090     __ bind(LOAD_LAST);
4091       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4092       // No need to load them again.
4093       __ mov(tmpU, tmp3);
4094       __ pop(spilled_regs, sp);
4095 
4096       __ ldrs(vtmp, Address(strL));
4097       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4098       __ fmovd(tmpL, vtmp);
4099 
4100       __ eor(rscratch2, tmpU, tmpL);
4101       __ cbz(rscratch2, DONE);
4102 
4103     // Find the first different characters in the longwords and
4104     // compute their difference.
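         // rev + clz locate the least-significant differing byte as a bit offset;
         // and-ing with -16 rounds that down to the start of the containing 16-bit
         // character, and the lsrv/uxthw pairs below then extract that character from
         // each word for the subtraction.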
4105     __ bind(CALCULATE_DIFFERENCE);
4106       __ rev(rscratch2, rscratch2);
4107       __ clz(rscratch2, rscratch2);
4108       __ andr(rscratch2, rscratch2, -16);
4109       __ lsrv(tmp1, tmp1, rscratch2);
4110       __ uxthw(tmp1, tmp1);
4111       __ lsrv(rscratch1, rscratch1, rscratch2);
4112       __ uxthw(rscratch1, rscratch1);
4113       __ subw(result, tmp1, rscratch1);
4114     __ bind(DONE);
4115       __ ret(lr);
4116     return entry;
4117   }
4118 
4119   // r0  = result
4120   // r1  = str1
4121   // r2  = cnt1
4122   // r3  = str2
4123   // r4  = cnt2
4124   // r10 = tmp1
4125   // r11 = tmp2
4126   address generate_compare_long_string_same_encoding(bool isLL) {
4127     __ align(CodeEntryAlignment);
4128     StubCodeMark mark(this, "StubRoutines", isLL
4129         ? "compare_long_string_same_encoding LL"
4130         : "compare_long_string_same_encoding UU");
4131     address entry = __ pc();
4132     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4133         tmp1 = r10, tmp2 = r11;
4134     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4135         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4136         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4137     // Exit the large loop when fewer than 64 bytes are left to read or we are about
4138     // to prefetch memory beyond the array border.
4139     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4140     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4141     // Update the cnt2 counter for the 8 bytes that have already been loaded.
4142     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4143     // update pointers, because of previous read
4144     __ add(str1, str1, wordSize);
4145     __ add(str2, str2, wordSize);
4146     if (SoftwarePrefetchHintDistance >= 0) {
4147       __ bind(LARGE_LOOP_PREFETCH);
4148         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4149         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4150         compare_string_16_bytes_same(DIFF, DIFF2);
4151         compare_string_16_bytes_same(DIFF, DIFF2);
4152         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4153         compare_string_16_bytes_same(DIFF, DIFF2);
4154         __ cmp(cnt2, largeLoopExitCondition);
4155         compare_string_16_bytes_same(DIFF, DIFF2);
4156         __ br(__ GT, LARGE_LOOP_PREFETCH);
4157         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4158     }
4159     // less than 16 bytes left?
4160     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4161     __ br(__ LT, TAIL);
4162     __ bind(SMALL_LOOP);
4163       compare_string_16_bytes_same(DIFF, DIFF2);
4164       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4165       __ br(__ GE, SMALL_LOOP);
4166     __ bind(TAIL);
4167       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4168       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4169       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4170       __ br(__ LE, CHECK_LAST);
4171       __ eor(rscratch2, tmp1, tmp2);
4172       __ cbnz(rscratch2, DIFF);
4173       __ ldr(tmp1, Address(__ post(str1, 8)));
4174       __ ldr(tmp2, Address(__ post(str2, 8)));
4175       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4176     __ bind(CHECK_LAST);
4177       if (!isLL) {
4178         __ add(cnt2, cnt2, cnt2); // now in bytes
4179       }
4180       __ eor(rscratch2, tmp1, tmp2);
4181       __ cbnz(rscratch2, DIFF);
4182       __ ldr(rscratch1, Address(str1, cnt2));
4183       __ ldr(cnt1, Address(str2, cnt2));
4184       __ eor(rscratch2, rscratch1, cnt1);
4185       __ cbz(rscratch2, LENGTH_DIFF);
4186       // Find the first different characters in the longwords and
4187       // compute their difference.
4188     __ bind(DIFF2);
4189       __ rev(rscratch2, rscratch2);
4190       __ clz(rscratch2, rscratch2);
4191       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4192       __ lsrv(rscratch1, rscratch1, rscratch2);
4193       if (isLL) {
4194         __ lsrv(cnt1, cnt1, rscratch2);
4195         __ uxtbw(rscratch1, rscratch1);
4196         __ uxtbw(cnt1, cnt1);
4197       } else {
4198         __ lsrv(cnt1, cnt1, rscratch2);
4199         __ uxthw(rscratch1, rscratch1);
4200         __ uxthw(cnt1, cnt1);
4201       }
4202       __ subw(result, rscratch1, cnt1);
4203       __ b(LENGTH_DIFF);
4204     __ bind(DIFF);
4205       __ rev(rscratch2, rscratch2);
4206       __ clz(rscratch2, rscratch2);
4207       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4208       __ lsrv(tmp1, tmp1, rscratch2);
4209       if (isLL) {
4210         __ lsrv(tmp2, tmp2, rscratch2);
4211         __ uxtbw(tmp1, tmp1);
4212         __ uxtbw(tmp2, tmp2);
4213       } else {
4214         __ lsrv(tmp2, tmp2, rscratch2);
4215         __ uxthw(tmp1, tmp1);
4216         __ uxthw(tmp2, tmp2);
4217       }
4218       __ subw(result, tmp1, tmp2);
4219       __ b(LENGTH_DIFF);
4220     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4221       __ eor(rscratch2, tmp1, tmp2);
4222       __ cbnz(rscratch2, DIFF);
4223     __ bind(LENGTH_DIFF);
4224       __ ret(lr);
4225     return entry;
4226   }
4227 
4228   void generate_compare_long_strings() {
4229       StubRoutines::aarch64::_compare_long_string_LL
4230           = generate_compare_long_string_same_encoding(true);
4231       StubRoutines::aarch64::_compare_long_string_UU
4232           = generate_compare_long_string_same_encoding(false);
4233       StubRoutines::aarch64::_compare_long_string_LU
4234           = generate_compare_long_string_different_encoding(true);
4235       StubRoutines::aarch64::_compare_long_string_UL
4236           = generate_compare_long_string_different_encoding(false);
4237   }
4238 
4239   // R0 = result
4240   // R1 = str2
4241   // R2 = cnt1
4242   // R3 = str1
4243   // R4 = cnt2
4244   // This generic linear code uses a few additional ideas which make it faster:
4245   // 1) we can safely keep at least the 1st register of the pattern (since
4246   // length >= 8) and skip the initial load (helps with a single load pipeline)
4247   // 2) we can use a "fast" algorithm for finding the pattern's first character
4248   // with fewer branches (1 branch per loaded register instead of one per
4249   // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
4250   // 0x7f7f...7f, 0x7fff7fff...7fff come from (a sketch of this follows below)
4251   // 3) after loading and analyzing the 1st register of the source string, it
4252   // can be used to search for every occurrence of the 1st character, saving a
4253   // few loads compared with a simpler-but-slower implementation
4254   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4255   // re-initializes and compresses register values, which makes the code larger
4256   // and a bit less readable; however, most of the extra operations are issued
4257   // during loads or branches, so the penalty is minimal
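  //
  // A sketch, in C, of the point-2 trick for the Latin-1 case: to test whether
  // any byte of a loaded 8-byte chunk 'v' equals the first pattern character
  // 'c', replicate 'c' into every byte and apply the classic SWAR zero-byte
  // test to the XOR:
  //
  //   uint64_t first = (uint8_t)c * 0x0101010101010101ULL;   // c in every byte
  //   uint64_t x     = v ^ first;                            // zero byte <=> match
  //   uint64_t hit   = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   // hit != 0 iff some byte of v equals c; each 0x80 bit marks a candidate
  //   // byte (candidates are then verified by the compare loops below)
  //
  // The UTF-16 variant uses 0x0001000100010001 and 0x7fff7fff7fff7fff with
  // 16-bit lanes. The code computes "& ~x & 0x80.." as BICS against
  // (x | 0x7f..), which also sets the flags for the following branch.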
4258   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4259     const char* stubName = str1_isL
4260         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4261         : "indexof_linear_uu";
4262     __ align(CodeEntryAlignment);
4263     StubCodeMark mark(this, "StubRoutines", stubName);
4264     address entry = __ pc();
4265 
4266     int str1_chr_size = str1_isL ? 1 : 2;
4267     int str2_chr_size = str2_isL ? 1 : 2;
4268     int str1_chr_shift = str1_isL ? 0 : 1;
4269     int str2_chr_shift = str2_isL ? 0 : 1;
4270     bool isL = str1_isL && str2_isL;
4271     // parameters
4272     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4273     // temporary registers
4274     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4275     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4276     // redefinitions
4277     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4278 
4279     __ push(spilled_regs, sp);
4280     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
4281         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4282         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4283         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4284         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4285         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4286     // Read whole register from str1. It is safe, because length >=8 here
4287     __ ldr(ch1, Address(str1));
4288     // Read whole register from str2. It is safe, because length >=8 here
4289     __ ldr(ch2, Address(str2));
4290     __ sub(cnt2, cnt2, cnt1);
4291     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4292     if (str1_isL != str2_isL) {
4293       __ eor(v0, __ T16B, v0, v0);
4294     }
4295     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4296     __ mul(first, first, tmp1);
4297     // check whether we have less than one register's worth of characters left
4298     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4299     if (str1_isL != str2_isL) {
4300       __ fmovd(v1, ch1);
4301     }
4302     __ br(__ LE, L_SMALL);
4303     __ eor(ch2, first, ch2);
4304     if (str1_isL != str2_isL) {
4305       __ zip1(v1, __ T16B, v1, v0);
4306     }
4307     __ sub(tmp2, ch2, tmp1);
4308     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4309     __ bics(tmp2, tmp2, ch2);
4310     if (str1_isL != str2_isL) {
4311       __ fmovd(ch1, v1);
4312     }
4313     __ br(__ NE, L_HAS_ZERO);
4314     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4315     __ add(result, result, wordSize/str2_chr_size);
4316     __ add(str2, str2, wordSize);
4317     __ br(__ LT, L_POST_LOOP);
4318     __ BIND(L_LOOP);
4319       __ ldr(ch2, Address(str2));
4320       __ eor(ch2, first, ch2);
4321       __ sub(tmp2, ch2, tmp1);
4322       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4323       __ bics(tmp2, tmp2, ch2);
4324       __ br(__ NE, L_HAS_ZERO);
4325     __ BIND(L_LOOP_PROCEED);
4326       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4327       __ add(str2, str2, wordSize);
4328       __ add(result, result, wordSize/str2_chr_size);
4329       __ br(__ GE, L_LOOP);
4330     __ BIND(L_POST_LOOP);
4331       __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
4332       __ br(__ LE, NOMATCH);
4333       __ ldr(ch2, Address(str2));
4334       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4335       __ eor(ch2, first, ch2);
4336       __ sub(tmp2, ch2, tmp1);
4337       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4338       __ mov(tmp4, -1); // all bits set
4339       __ b(L_SMALL_PROCEED);
4340     __ align(OptoLoopAlignment);
4341     __ BIND(L_SMALL);
4342       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4343       __ eor(ch2, first, ch2);
4344       if (str1_isL != str2_isL) {
4345         __ zip1(v1, __ T16B, v1, v0);
4346       }
4347       __ sub(tmp2, ch2, tmp1);
4348       __ mov(tmp4, -1); // all bits set
4349       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4350       if (str1_isL != str2_isL) {
4351         __ fmovd(ch1, v1); // move converted 4 symbols
4352       }
4353     __ BIND(L_SMALL_PROCEED);
4354       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4355       __ bic(tmp2, tmp2, ch2);
4356       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4357       __ rbit(tmp2, tmp2);
4358       __ br(__ EQ, NOMATCH);
4359     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4360       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4361       __ cmp(cnt1, wordSize/str2_chr_size);
4362       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4363       if (str2_isL) { // LL
4364         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4365         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4366         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4367         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4368         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4369       } else {
4370         __ mov(ch2, 0xE); // all bits in byte set except last one
4371         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4372         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4373         __ lslv(tmp2, tmp2, tmp4);
4374         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4375         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4376         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4377         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4378       }
4379       __ cmp(ch1, ch2);
4380       __ mov(tmp4, wordSize/str2_chr_size);
4381       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4382     __ BIND(L_SMALL_CMP_LOOP);
4383       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4384                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4385       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4386                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4387       __ add(tmp4, tmp4, 1);
4388       __ cmp(tmp4, cnt1);
4389       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4390       __ cmp(first, ch2);
4391       __ br(__ EQ, L_SMALL_CMP_LOOP);
4392     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4393       __ cbz(tmp2, NOMATCH); // no more matches. exit
4394       __ clz(tmp4, tmp2);
4395       __ add(result, result, 1); // advance index
4396       __ add(str2, str2, str2_chr_size); // advance pointer
4397       __ b(L_SMALL_HAS_ZERO_LOOP);
4398     __ align(OptoLoopAlignment);
4399     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4400       __ cmp(first, ch2);
4401       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4402       __ b(DONE);
4403     __ align(OptoLoopAlignment);
4404     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4405       if (str2_isL) { // LL
4406         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4407         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4408         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4409         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4410         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4411       } else {
4412         __ mov(ch2, 0xE); // all bits in byte set except last one
4413         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4414         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4415         __ lslv(tmp2, tmp2, tmp4);
4416         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4417         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4418         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4419         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4420       }
4421       __ cmp(ch1, ch2);
4422       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4423       __ b(DONE);
4424     __ align(OptoLoopAlignment);
4425     __ BIND(L_HAS_ZERO);
4426       __ rbit(tmp2, tmp2);
4427       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4428       // Now, compress both counters (cnt2 and cnt1) into one register. This is
4429       // fine because both counters are 32-bit and are not changed in this loop;
4430       // they are just restored on exit. So, cnt1 can be re-used in this loop.
4431       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4432       __ sub(result, result, 1);
4433     __ BIND(L_HAS_ZERO_LOOP);
4434       __ mov(cnt1, wordSize/str2_chr_size);
4435       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4436       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4437       if (str2_isL) {
4438         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4439         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4440         __ lslv(tmp2, tmp2, tmp4);
4441         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4442         __ add(tmp4, tmp4, 1);
4443         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4444         __ lsl(tmp2, tmp2, 1);
4445         __ mov(tmp4, wordSize/str2_chr_size);
4446       } else {
4447         __ mov(ch2, 0xE);
4448         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4449         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4450         __ lslv(tmp2, tmp2, tmp4);
4451         __ add(tmp4, tmp4, 1);
4452         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4453         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4454         __ lsl(tmp2, tmp2, 1);
4455         __ mov(tmp4, wordSize/str2_chr_size);
4456         __ sub(str2, str2, str2_chr_size);
4457       }
4458       __ cmp(ch1, ch2);
4459       __ mov(tmp4, wordSize/str2_chr_size);
4460       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4461     __ BIND(L_CMP_LOOP);
4462       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4463                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4464       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4465                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4466       __ add(tmp4, tmp4, 1);
4467       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4468       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4469       __ cmp(cnt1, ch2);
4470       __ br(__ EQ, L_CMP_LOOP);
4471     __ BIND(L_CMP_LOOP_NOMATCH);
4472       // here we're not matched
4473       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4474       __ clz(tmp4, tmp2);
4475       __ add(str2, str2, str2_chr_size); // advance pointer
4476       __ b(L_HAS_ZERO_LOOP);
4477     __ align(OptoLoopAlignment);
4478     __ BIND(L_CMP_LOOP_LAST_CMP);
4479       __ cmp(cnt1, ch2);
4480       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4481       __ b(DONE);
4482     __ align(OptoLoopAlignment);
4483     __ BIND(L_CMP_LOOP_LAST_CMP2);
4484       if (str2_isL) {
4485         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4486         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4487         __ lslv(tmp2, tmp2, tmp4);
4488         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4489         __ add(tmp4, tmp4, 1);
4490         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4491         __ lsl(tmp2, tmp2, 1);
4492       } else {
4493         __ mov(ch2, 0xE);
4494         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4495         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4496         __ lslv(tmp2, tmp2, tmp4);
4497         __ add(tmp4, tmp4, 1);
4498         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4499         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4500         __ lsl(tmp2, tmp2, 1);
4501         __ sub(str2, str2, str2_chr_size);
4502       }
4503       __ cmp(ch1, ch2);
4504       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4505       __ b(DONE);
4506     __ align(OptoLoopAlignment);
4507     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4508       // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
4509       // until the L_HAS_ZERO block; L_HAS_ZERO_LOOP analyzed one byte octet, so
4510       // result grew by at most wordSize/str2_chr_size - 1 and its higher bits did
4511       // not change. L_LOOP_PROCEED will increase result by the number of analyzed
4512       // characters, so we can simply reset the lower bits of result here: clear
4513       // the 2 lower bits for UU/UL and the 3 lower bits for LL.
4514       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4515       // 3) Advance str2 so it refers to the next str2 octet. result & 7 (for LL)
4516       // or result & 3 (for UU/UL) is the index of the last analyzed substring in
4517       // the current octet, so str2 is at its start; advance it to the next octet.
4518       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4519       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4520       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4521       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4522       __ movw(cnt2, cnt2);
4523       __ b(L_LOOP_PROCEED);
4524     __ align(OptoLoopAlignment);
4525     __ BIND(NOMATCH);
4526       __ mov(result, -1);
4527     __ BIND(DONE);
4528       __ pop(spilled_regs, sp);
4529       __ ret(lr);
4530     return entry;
4531   }
4532 
4533   void generate_string_indexof_stubs() {
4534     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4535     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4536     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4537   }
4538 
4539   void inflate_and_store_2_fp_registers(bool generatePrfm,
4540       FloatRegister src1, FloatRegister src2) {
4541     Register dst = r1;
4542     __ zip1(v1, __ T16B, src1, v0);
4543     __ zip2(v2, __ T16B, src1, v0);
4544     if (generatePrfm) {
4545       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4546     }
4547     __ zip1(v3, __ T16B, src2, v0);
4548     __ zip2(v4, __ T16B, src2, v0);
4549     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4550   }
4551 
4552   // R0 = src
4553   // R1 = dst
4554   // R2 = len
4555   // R3 = len >> 3
4556   // V0 = 0
4557   // v1 = loaded 8 bytes
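  //
  // In C, approximately (a sketch of the transformation; the stub implements
  // it with ZIP1/ZIP2 against a zero register, which interleaves a 0x00 byte
  // after every source byte -- the little-endian layout of a jchar):
  //
  //   void inflate(const jbyte* src, jchar* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte to a char
  //     }
  //   }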
4558   address generate_large_byte_array_inflate() {
4559     __ align(CodeEntryAlignment);
4560     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4561     address entry = __ pc();
4562     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4563     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4564     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4565 
4566     // Do one more 8-byte read so that the address is 16-byte aligned in most
4567     // cases; this also lets us use a single store instruction below.
4568     __ ldrd(v2, __ post(src, 8));
4569     __ sub(octetCounter, octetCounter, 2);
4570     __ zip1(v1, __ T16B, v1, v0);
4571     __ zip1(v2, __ T16B, v2, v0);
4572     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4573     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4574     __ cmp(octetCounter, large_loop_threshold);
4575     __ br(__ LE, LOOP_START);
4576     __ b(LOOP_PRFM_START);
4577     __ bind(LOOP_PRFM);
4578       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4579     __ bind(LOOP_PRFM_START);
4580       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4581       __ sub(octetCounter, octetCounter, 8);
4582       __ cmp(octetCounter, large_loop_threshold);
4583       inflate_and_store_2_fp_registers(true, v3, v4);
4584       inflate_and_store_2_fp_registers(true, v5, v6);
4585       __ br(__ GT, LOOP_PRFM);
4586       __ cmp(octetCounter, 8);
4587       __ br(__ LT, DONE);
4588     __ bind(LOOP);
4589       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4590       __ bind(LOOP_START);
4591       __ sub(octetCounter, octetCounter, 8);
4592       __ cmp(octetCounter, 8);
4593       inflate_and_store_2_fp_registers(false, v3, v4);
4594       inflate_and_store_2_fp_registers(false, v5, v6);
4595       __ br(__ GE, LOOP);
4596     __ bind(DONE);
4597       __ ret(lr);
4598     return entry;
4599   }
4600 
4601   /**
4602    *  Arguments:
4603    *
4604    *  Input:
4605    *  c_rarg0   - current state address
4606    *  c_rarg1   - H key address
4607    *  c_rarg2   - data address
4608    *  c_rarg3   - number of blocks
4609    *
4610    *  Output:
4611    *  Updated state at c_rarg0
4612    */
4613   address generate_ghash_processBlocks() {
4614     // Bafflingly, GCM uses little-endian for the byte order, but
4615     // big-endian for the bit order.  For example, the polynomial 1 is
4616     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4617     //
4618     // So, we must either reverse the bytes in each word and do
4619     // everything big-endian or reverse the bits in each byte and do
4620     // it little-endian.  On AArch64 it's more idiomatic to reverse
4621     // the bits in each byte (we have an instruction, RBIT, to do
4622     // that) and keep the data in little-endian bit order throughout the
4623     // calculation, bit-reversing the inputs and outputs.
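    //
    // In outline, the loop below computes the standard GHASH recurrence
    // (a sketch; X[i] are the 16-byte input blocks, H is the hash subkey, and
    // gf128_mul is carry-less multiplication in GF(2^128) reduced modulo
    // x^128 + x^7 + x^2 + x + 1):
    //
    //   state = incoming 128-bit state at c_rarg0;
    //   for (i = 0; i < blocks; i++) {
    //     state = gf128_mul(state ^ X[i], H);
    //   }
    //
    // with the byte/bit-order adjustments described above applied on input
    // and output.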
4624 
4625     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4626     __ align(wordSize * 2);
4627     address p = __ pc();
4628     __ emit_int64(0x87);  // The low-order bits of the field
4629                           // polynomial (i.e. p = z^7+z^2+z+1)
4630                           // repeated in the low and high parts of a
4631                           // 128-bit vector
4632     __ emit_int64(0x87);
4633 
4634     __ align(CodeEntryAlignment);
4635     address start = __ pc();
4636 
4637     Register state   = c_rarg0;
4638     Register subkeyH = c_rarg1;
4639     Register data    = c_rarg2;
4640     Register blocks  = c_rarg3;
4641 
4642     FloatRegister vzr = v30;
4643     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4644 
4645     __ ldrq(v0, Address(state));
4646     __ ldrq(v1, Address(subkeyH));
4647 
4648     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4649     __ rbit(v0, __ T16B, v0);
4650     __ rev64(v1, __ T16B, v1);
4651     __ rbit(v1, __ T16B, v1);
4652 
4653     __ ldrq(v26, p);
4654 
4655     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4656     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4657 
4658     {
4659       Label L_ghash_loop;
4660       __ bind(L_ghash_loop);
4661 
4662       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4663                                                  // reversing each byte
4664       __ rbit(v2, __ T16B, v2);
4665       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4666 
4667       // Multiply state in v2 by subkey in v1
4668       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4669                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4670                      /*temps*/v6, v20, v18, v21);
4671       // Reduce v7:v5 by the field polynomial
4672       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4673 
4674       __ sub(blocks, blocks, 1);
4675       __ cbnz(blocks, L_ghash_loop);
4676     }
4677 
4678     // The bit-reversed result is at this point in v0
4679     __ rev64(v1, __ T16B, v0);
4680     __ rbit(v1, __ T16B, v1);
4681 
4682     __ st1(v1, __ T16B, state);
4683     __ ret(lr);
4684 
4685     return start;
4686   }
4687 
4688 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
4689 
4690   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
4691   //
4692   // If LSE is in use, generate LSE versions of all the stubs. The
4693   // non-LSE versions are in atomic_aarch64.S.
4694 
4695   // class AtomicStubMark records the entry point of a stub and the
4696   // stub pointer which will point to it. The stub pointer is set to
4697   // the entry point when ~AtomicStubMark() is called, which must be
4698   // after ICache::invalidate_range. This ensures safe publication of
4699   // the generated code.
4700   class AtomicStubMark {
4701     address _entry_point;
4702     aarch64_atomic_stub_t *_stub;
4703     MacroAssembler *_masm;
4704   public:
4705     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
4706       _masm = masm;
4707       __ align(32);
4708       _entry_point = __ pc();
4709       _stub = stub;
4710     }
4711     ~AtomicStubMark() {
4712       *_stub = (aarch64_atomic_stub_t)_entry_point;
4713     }
4714   };
4715 
4716   // NB: For memory_order_conservative we need a trailing membar after
4717   // LSE atomic operations but not a leading membar.
4718   //
4719   // We don't need a leading membar because a clause in the Arm ARM
4720   // says:
4721   //
4722   //   Barrier-ordered-before
4723   //
4724   //   Barrier instructions order prior Memory effects before subsequent
4725   //   Memory effects generated by the same Observer. A read or a write
4726   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
4727   //   Observer if and only if RW1 appears in program order before RW 2
4728   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
4729   //   instruction with both Acquire and Release semantics.
4730   //
4731   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
4732   // and Release semantics, therefore we don't need a leading
4733   // barrier. However, there is no corresponding Barrier-ordered-after
4734   // relationship, therefore we need a trailing membar to prevent a
4735   // later store or load from being reordered with the store in an
4736   // atomic instruction.
4737   //
4738   // This was checked by using the herd7 consistency model simulator
4739   // (http://diy.inria.fr/) with this test case:
4740   //
4741   // AArch64 LseCas
4742   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
4743   // P0 | P1;
4744   // LDR W4, [X2] | MOV W3, #0;
4745   // DMB LD       | MOV W4, #1;
4746   // LDR W3, [X1] | CASAL W3, W4, [X1];
4747   //              | DMB ISH;
4748   //              | STR W4, [X2];
4749   // exists
4750   // (0:X3=0 /\ 0:X4=1)
4751   //
4752   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
4753   // with the store to x in P1. Without the DMB in P1 this may happen.
4754   //
4755   // At the time of writing we don't know of any AArch64 hardware that
4756   // reorders stores in this way, but the Reference Manual permits it.
4757 
4758   void gen_cas_entry(Assembler::operand_size size,
4759                      atomic_memory_order order) {
4760     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
4761       exchange_val = c_rarg2;
4762     bool acquire, release;
4763     switch (order) {
4764       case memory_order_relaxed:
4765         acquire = false;
4766         release = false;
4767         break;
4768       default:
4769         acquire = true;
4770         release = true;
4771         break;
4772     }
4773     __ mov(prev, compare_val);
4774     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
4775     if (order == memory_order_conservative) {
4776       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4777     }
4778     if (size == Assembler::xword) {
4779       __ mov(r0, prev);
4780     } else {
4781       __ movw(r0, prev);
4782     }
4783     __ ret(lr);
4784   }
4785 
4786   void gen_ldaddal_entry(Assembler::operand_size size) {
4787     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
4788     __ ldaddal(size, incr, prev, addr);
4789     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4790     if (size == Assembler::xword) {
4791       __ mov(r0, prev);
4792     } else {
4793       __ movw(r0, prev);
4794     }
4795     __ ret(lr);
4796   }
4797 
4798   void gen_swpal_entry(Assembler::operand_size size) {
4799     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
4800     __ swpal(size, incr, prev, addr);
4801     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4802     if (size == Assembler::xword) {
4803       __ mov(r0, prev);
4804     } else {
4805       __ movw(r0, prev);
4806     }
4807     __ ret(lr);
4808   }
4809 
4810   void generate_atomic_entry_points() {
4811     if (! UseLSE) {
4812       return;
4813     }
4814 
4815     __ align(CodeEntryAlignment);
4816     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
4817     address first_entry = __ pc();
4818 
4819     // All memory_order_conservative
4820     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
4821     gen_ldaddal_entry(Assembler::word);
4822     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
4823     gen_ldaddal_entry(Assembler::xword);
4824 
4825     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
4826     gen_swpal_entry(Assembler::word);
4827     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
4828     gen_swpal_entry(Assembler::xword);
4829 
4830     // CAS, memory_order_conservative
4831     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
4832     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
4833     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
4834     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
4835     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
4836     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
4837 
4838     // CAS, memory_order_relaxed
4839     AtomicStubMark mark_cmpxchg_1_relaxed
4840       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
4841     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
4842     AtomicStubMark mark_cmpxchg_4_relaxed
4843       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
4844     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
4845     AtomicStubMark mark_cmpxchg_8_relaxed
4846       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
4847     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
4848 
4849     ICache::invalidate_range(first_entry, __ pc() - first_entry);
4850   }
4851 #endif // LINUX || _ALLBSD_SOURCE
4852 
4853   void generate_base64_encode_simdround(Register src, Register dst,
4854         FloatRegister codec, u8 size) {
4855 
4856     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
4857     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
4858     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
4859 
4860     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
4861 
4862     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
4863 
4864     __ ushr(ind0, arrangement, in0,  2);
4865 
4866     __ ushr(ind1, arrangement, in1,  2);
4867     __ shl(in0,   arrangement, in0,  6);
4868     __ orr(ind1,  arrangement, ind1, in0);
4869     __ ushr(ind1, arrangement, ind1, 2);
4870 
4871     __ ushr(ind2, arrangement, in2,  4);
4872     __ shl(in1,   arrangement, in1,  4);
4873     __ orr(ind2,  arrangement, in1,  ind2);
4874     __ ushr(ind2, arrangement, ind2, 2);
4875 
4876     __ shl(ind3,  arrangement, in2,  2);
4877     __ ushr(ind3, arrangement, ind3, 2);
4878 
4879     __ tbl(out0,  arrangement, codec,  4, ind0);
4880     __ tbl(out1,  arrangement, codec,  4, ind1);
4881     __ tbl(out2,  arrangement, codec,  4, ind2);
4882     __ tbl(out3,  arrangement, codec,  4, ind3);
4883 
4884     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
4885   }
4886 
4887    /**
4888    *  Arguments:
4889    *
4890    *  Input:
4891    *  c_rarg0   - src_start
4892    *  c_rarg1   - src_offset
4893    *  c_rarg2   - src_length
4894    *  c_rarg3   - dest_start
4895    *  c_rarg4   - dest_offset
4896    *  c_rarg5   - isURL
4897    *
4898    */
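  // For each 3-byte group, Base64 encoding is, in C, approximately (a sketch;
  // codec is one of the 64-entry tables above):
  //
  //   void encode3(const unsigned char in[3], char out[4], const char* codec) {
  //     unsigned v = (in[0] << 16) | (in[1] << 8) | in[2];  // 24 bits
  //     out[0] = codec[(v >> 18) & 0x3f];
  //     out[1] = codec[(v >> 12) & 0x3f];
  //     out[2] = codec[(v >>  6) & 0x3f];
  //     out[3] = codec[ v        & 0x3f];
  //   }
  //
  // The SIMD rounds below do the same for 8 or 16 groups at a time, using
  // USHR/SHL/ORR to form the four 6-bit indexes and TBL for the table lookup.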
4899   address generate_base64_encodeBlock() {
4900 
4901     static const char toBase64[64] = {
4902       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4903       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4904       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4905       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4906       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
4907     };
4908 
4909     static const char toBase64URL[64] = {
4910       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4911       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4912       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4913       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4914       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
4915     };
4916 
4917     __ align(CodeEntryAlignment);
4918     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
4919     address start = __ pc();
4920 
4921     Register src   = c_rarg0;  // source array
4922     Register soff  = c_rarg1;  // source start offset
4923     Register send  = c_rarg2;  // source end offset
4924     Register dst   = c_rarg3;  // dest array
4925     Register doff  = c_rarg4;  // position for writing to dest array
4926     Register isURL = c_rarg5;  // Base64 or URL character set
4927 
4928     // c_rarg6 and c_rarg7 are free to use as temps
4929     Register codec  = c_rarg6;
4930     Register length = c_rarg7;
4931 
4932     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
4933 
4934     __ add(src, src, soff);
4935     __ add(dst, dst, doff);
4936     __ sub(length, send, soff);
4937 
4938     // load the codec base address
4939     __ lea(codec, ExternalAddress((address) toBase64));
4940     __ cbz(isURL, ProcessData);
4941     __ lea(codec, ExternalAddress((address) toBase64URL));
4942 
4943     __ BIND(ProcessData);
4944 
4945     // too short to form a SIMD loop; fall back to the scalar 3-byte loop
4946     __ cmp(length, (u1)24);
4947     __ br(Assembler::LT, Process3B);
4948 
4949     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
4950 
4951     __ BIND(Process48B);
4952     __ cmp(length, (u1)48);
4953     __ br(Assembler::LT, Process24B);
4954     generate_base64_encode_simdround(src, dst, v0, 16);
4955     __ sub(length, length, 48);
4956     __ b(Process48B);
4957 
4958     __ BIND(Process24B);
4959     __ cmp(length, (u1)24);
4960     __ br(Assembler::LT, SIMDExit);
4961     generate_base64_encode_simdround(src, dst, v0, 8);
4962     __ sub(length, length, 24);
4963 
4964     __ BIND(SIMDExit);
4965     __ cbz(length, Exit);
4966 
4967     __ BIND(Process3B);
4968     //  3 src bytes, 24 bits
4969     __ ldrb(r10, __ post(src, 1));
4970     __ ldrb(r11, __ post(src, 1));
4971     __ ldrb(r12, __ post(src, 1));
4972     __ orrw(r11, r11, r10, Assembler::LSL, 8);
4973     __ orrw(r12, r12, r11, Assembler::LSL, 8);
4974     // codec index
4975     __ ubfmw(r15, r12, 18, 23);
4976     __ ubfmw(r14, r12, 12, 17);
4977     __ ubfmw(r13, r12, 6,  11);
4978     __ andw(r12,  r12, 63);
4979     // get the code based on the codec
4980     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
4981     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
4982     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
4983     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
4984     __ strb(r15, __ post(dst, 1));
4985     __ strb(r14, __ post(dst, 1));
4986     __ strb(r13, __ post(dst, 1));
4987     __ strb(r12, __ post(dst, 1));
4988     __ sub(length, length, 3);
4989     __ cbnz(length, Process3B);
4990 
4991     __ BIND(Exit);
4992     __ ret(lr);
4993 
4994     return start;
4995   }
4996 
4997   // Continuation point for throwing of implicit exceptions that are
4998   // not handled in the current activation. Fabricates an exception
4999   // oop and initiates normal exception dispatching in this
5000   // frame. Since we need to preserve callee-saved values (currently
5001   // only for C2, but done for C1 as well) we need a callee-saved oop
5002   // map and therefore have to make these stubs into RuntimeStubs
5003   // rather than BufferBlobs.  If the compiler needs all registers to
5004   // be preserved between the fault point and the exception handler
5005   // then it must assume responsibility for that in
5006   // AbstractCompiler::continuation_for_implicit_null_exception or
5007   // continuation_for_implicit_division_by_zero_exception. All other
5008   // implicit exceptions (e.g., NullPointerException or
5009   // AbstractMethodError on entry) are either at call sites or
5010   // otherwise assume that stack unwinding will be initiated, so
5011   // caller saved registers were assumed volatile in the compiler.
5012 
5013 #undef __
5014 #define __ masm->
5015 
5016   address generate_throw_exception(const char* name,
5017                                    address runtime_entry,
5018                                    Register arg1 = noreg,
5019                                    Register arg2 = noreg) {
5020     // Information about frame layout at time of blocking runtime call.
5021     // Note that we only have to preserve callee-saved registers since
5022     // the compilers are responsible for supplying a continuation point
5023     // if they expect all registers to be preserved.
5024     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5025     enum layout {
5026       rfp_off = 0,
5027       rfp_off2,
5028       return_off,
5029       return_off2,
5030       framesize // inclusive of return address
5031     };
5032 
5033     int insts_size = 512;
5034     int locs_size  = 64;
5035 
5036     CodeBuffer code(name, insts_size, locs_size);
5037     OopMapSet* oop_maps  = new OopMapSet();
5038     MacroAssembler* masm = new MacroAssembler(&code);
5039 
5040     address start = __ pc();
5041 
5042     // This is an inlined and slightly modified version of call_VM
5043     // which has the ability to fetch the return PC out of
5044     // thread-local storage and also sets up last_Java_sp slightly
5045     // differently than the real call_VM
5046 
5047     __ enter(); // Save FP and LR before call
5048 
5049     assert(is_even(framesize/2), "sp not 16-byte aligned");
5050 
5051     // lr and fp are already in place
5052     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
5053 
5054     int frame_complete = __ pc() - start;
5055 
5056     // Set up last_Java_sp and last_Java_fp
5057     address the_pc = __ pc();
5058     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5059 
5060     // Call runtime
5061     if (arg1 != noreg) {
5062       assert(arg2 != c_rarg1, "clobbered");
5063       __ mov(c_rarg1, arg1);
5064     }
5065     if (arg2 != noreg) {
5066       __ mov(c_rarg2, arg2);
5067     }
5068     __ mov(c_rarg0, rthread);
5069     BLOCK_COMMENT("call runtime_entry");
5070     __ mov(rscratch1, runtime_entry);
5071     __ blr(rscratch1);
5072 
5073     // Generate oop map
5074     OopMap* map = new OopMap(framesize, 0);
5075 
5076     oop_maps->add_gc_map(the_pc - start, map);
5077 
5078     __ reset_last_Java_frame(true);
5079     __ maybe_isb();
5080 
5081     __ leave();
5082 
5083     // check for pending exceptions
5084 #ifdef ASSERT
5085     Label L;
5086     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5087     __ cbnz(rscratch1, L);
5088     __ should_not_reach_here();
5089     __ bind(L);
5090 #endif // ASSERT
5091     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5092 
5093 
5094     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5095     RuntimeStub* stub =
5096       RuntimeStub::new_runtime_stub(name,
5097                                     &code,
5098                                     frame_complete,
5099                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5100                                     oop_maps, false);
5101     return stub->entry_point();
5102   }
5103 
5104   class MontgomeryMultiplyGenerator : public MacroAssembler {
5105 
5106     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5107       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5108 
5109     RegSet _toSave;
5110     bool _squaring;
5111 
5112   public:
5113     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5114       : MacroAssembler(as->code()), _squaring(squaring) {
5115 
5116       // Register allocation
5117 
5118       Register reg = c_rarg0;
5119       Pa_base = reg;       // Argument registers
5120       if (squaring)
5121         Pb_base = Pa_base;
5122       else
5123         Pb_base = ++reg;
5124       Pn_base = ++reg;
5125       Rlen= ++reg;
5126       inv = ++reg;
5127       Pm_base = ++reg;
5128 
5129                           // Working registers:
5130       Ra =  ++reg;        // The current digit of a, b, n, and m.
5131       Rb =  ++reg;
5132       Rm =  ++reg;
5133       Rn =  ++reg;
5134 
5135       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
5136       Pb =  ++reg;
5137       Pm =  ++reg;
5138       Pn =  ++reg;
5139 
5140       t0 =  ++reg;        // Three registers which form a
5141       t1 =  ++reg;        // triple-precision accumulator.
5142       t2 =  ++reg;
5143 
5144       Ri =  ++reg;        // Inner and outer loop indexes.
5145       Rj =  ++reg;
5146 
5147       Rhi_ab = ++reg;     // Product registers: low and high parts
5148       Rlo_ab = ++reg;     // of a*b and m*n.
5149       Rhi_mn = ++reg;
5150       Rlo_mn = ++reg;
5151 
5152       // r19 and up are callee-saved.
5153       _toSave = RegSet::range(r19, reg) + Pm_base;
5154     }
5155 
5156   private:
5157     void save_regs() {
5158       push(_toSave, sp);
5159     }
5160 
5161     void restore_regs() {
5162       pop(_toSave, sp);
5163     }
5164 
5165     template <typename T>
5166     void unroll_2(Register count, T block) {
5167       Label loop, end, odd;
5168       tbnz(count, 0, odd);
5169       cbz(count, end);
5170       align(16);
5171       bind(loop);
5172       (this->*block)();
5173       bind(odd);
5174       (this->*block)();
5175       subs(count, count, 2);
5176       br(Assembler::GT, loop);
5177       bind(end);
5178     }
5179 
5180     template <typename T>
5181     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5182       Label loop, end, odd;
5183       tbnz(count, 0, odd);
5184       cbz(count, end);
5185       align(16);
5186       bind(loop);
5187       (this->*block)(d, s, tmp);
5188       bind(odd);
5189       (this->*block)(d, s, tmp);
5190       subs(count, count, 2);
5191       br(Assembler::GT, loop);
5192       bind(end);
5193     }
5194 
5195     void pre1(RegisterOrConstant i) {
5196       block_comment("pre1");
5197       // Pa = Pa_base;
5198       // Pb = Pb_base + i;
5199       // Pm = Pm_base;
5200       // Pn = Pn_base + i;
5201       // Ra = *Pa;
5202       // Rb = *Pb;
5203       // Rm = *Pm;
5204       // Rn = *Pn;
5205       ldr(Ra, Address(Pa_base));
5206       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5207       ldr(Rm, Address(Pm_base));
5208       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5209       lea(Pa, Address(Pa_base));
5210       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5211       lea(Pm, Address(Pm_base));
5212       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5213 
5214       // Zero the m*n result.
5215       mov(Rhi_mn, zr);
5216       mov(Rlo_mn, zr);
5217     }
5218 
5219     // The core multiply-accumulate step of a Montgomery
5220     // multiplication.  The idea is to schedule operations as a
5221     // pipeline so that instructions with long latencies (loads and
5222     // multiplies) have time to complete before their results are
5223     // used.  This most benefits in-order implementations of the
5224     // architecture but out-of-order ones also benefit.
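    //
    // In C, the MACC step referenced in the comments is approximately
    // (a sketch; t0:t1:t2 is the triple-precision accumulator, least
    // significant word first):
    //
    //   void MACC(julong a, julong b, julong& t0, julong& t1, julong& t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;
    //     t0 = (julong)s;
    //     s = (s >> 64) + t1 + (julong)(p >> 64);
    //     t1 = (julong)s;
    //     t2 += (julong)(s >> 64);
    //   }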
5225     void step() {
5226       block_comment("step");
5227       // MACC(Ra, Rb, t0, t1, t2);
5228       // Ra = *++Pa;
5229       // Rb = *--Pb;
5230       umulh(Rhi_ab, Ra, Rb);
5231       mul(Rlo_ab, Ra, Rb);
5232       ldr(Ra, pre(Pa, wordSize));
5233       ldr(Rb, pre(Pb, -wordSize));
5234       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5235                                        // previous iteration.
5236       // MACC(Rm, Rn, t0, t1, t2);
5237       // Rm = *++Pm;
5238       // Rn = *--Pn;
5239       umulh(Rhi_mn, Rm, Rn);
5240       mul(Rlo_mn, Rm, Rn);
5241       ldr(Rm, pre(Pm, wordSize));
5242       ldr(Rn, pre(Pn, -wordSize));
5243       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5244     }
5245 
5246     void post1() {
5247       block_comment("post1");
5248 
5249       // MACC(Ra, Rb, t0, t1, t2);
5250       // Ra = *++Pa;
5251       // Rb = *--Pb;
5252       umulh(Rhi_ab, Ra, Rb);
5253       mul(Rlo_ab, Ra, Rb);
5254       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5255       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5256 
5257       // *Pm = Rm = t0 * inv;
5258       mul(Rm, t0, inv);
5259       str(Rm, Address(Pm));
5260 
5261       // MACC(Rm, Rn, t0, t1, t2);
5262       // t0 = t1; t1 = t2; t2 = 0;
5263       umulh(Rhi_mn, Rm, Rn);
5264 
5265 #ifndef PRODUCT
5266       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5267       {
5268         mul(Rlo_mn, Rm, Rn);
5269         add(Rlo_mn, t0, Rlo_mn);
5270         Label ok;
5271         cbz(Rlo_mn, ok); {
5272           stop("broken Montgomery multiply");
5273         } bind(ok);
5274       }
5275 #endif
5276       // We have very carefully set things up so that
5277       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5278       // the lower half of Rm * Rn because we know the result already:
5279       // it must be -t0.  t0 + (-t0) must generate a carry iff
5280       // t0 != 0.  So, rather than do a mul and an adds we just set
5281       // the carry flag iff t0 is nonzero.
5282       //
5283       // mul(Rlo_mn, Rm, Rn);
5284       // adds(zr, t0, Rlo_mn);
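      //
      // Concretely: t0 + (2^64 - t0) equals 2^64 exactly when t0 != 0, which
      // overflows the 64-bit add and sets the carry; when t0 == 0 the sum is 0
      // and the carry stays clear. SUBS zr, t0, #1 sets the carry (no borrow)
      // in exactly the same cases, because t0 - 1 borrows only when t0 == 0.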
5285       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5286       adcs(t0, t1, Rhi_mn);
5287       adc(t1, t2, zr);
5288       mov(t2, zr);
5289     }
5290 
5291     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5292       block_comment("pre2");
5293       // Pa = Pa_base + i-len;
5294       // Pb = Pb_base + len;
5295       // Pm = Pm_base + i-len;
5296       // Pn = Pn_base + len;
5297 
5298       if (i.is_register()) {
5299         sub(Rj, i.as_register(), len);
5300       } else {
5301         mov(Rj, i.as_constant());
5302         sub(Rj, Rj, len);
5303       }
5304       // Rj == i-len
5305 
5306       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5307       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5308       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5309       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5310 
5311       // Ra = *++Pa;
5312       // Rb = *--Pb;
5313       // Rm = *++Pm;
5314       // Rn = *--Pn;
5315       ldr(Ra, pre(Pa, wordSize));
5316       ldr(Rb, pre(Pb, -wordSize));
5317       ldr(Rm, pre(Pm, wordSize));
5318       ldr(Rn, pre(Pn, -wordSize));
5319 
5320       mov(Rhi_mn, zr);
5321       mov(Rlo_mn, zr);
5322     }
5323 
5324     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5325       block_comment("post2");
5326       if (i.is_constant()) {
5327         mov(Rj, i.as_constant()-len.as_constant());
5328       } else {
5329         sub(Rj, i.as_register(), len);
5330       }
5331 
5332       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5333 
5334       // As soon as we know the least significant digit of our result,
5335       // store it.
5336       // Pm_base[i-len] = t0;
5337       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5338 
5339       // t0 = t1; t1 = t2; t2 = 0;
5340       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5341       adc(t1, t2, zr);
5342       mov(t2, zr);
5343     }
5344 
5345     // A carry in t0 after Montgomery multiplication means that we
5346     // should subtract multiples of n from our result in m.  We'll
5347     // keep doing that until there is no carry.
5348     void normalize(RegisterOrConstant len) {
5349       block_comment("normalize");
5350       // while (t0)
5351       //   t0 = sub(Pm_base, Pn_base, t0, len);
5352       Label loop, post, again;
5353       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5354       cbz(t0, post); {
5355         bind(again); {
5356           mov(i, zr);
5357           mov(cnt, len);
5358           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5359           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5360           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5361           align(16);
5362           bind(loop); {
5363             sbcs(Rm, Rm, Rn);
5364             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5365             add(i, i, 1);
5366             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5367             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5368             sub(cnt, cnt, 1);
5369           } cbnz(cnt, loop);
5370           sbc(t0, t0, zr);
5371         } cbnz(t0, again);
5372       } bind(post);
5373     }
5374 
5375     // Move memory at s to d, reversing words.
5376     //    Increments d to end of copied memory
5377     //    Destroys tmp1, tmp2
5378     //    Preserves len
5379     //    Leaves s pointing to the address which was in d at start
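    //
    // In C, approximately (a sketch; len is in 64-bit longwords, and reversing
    // the longwords while swapping the 32-bit halves of each one is equivalent
    // to reversing the underlying int array):
    //
    //   void reverse(julong* d, const julong* s, int len) {
    //     for (int i = 0; i < len; i++) {
    //       julong x = s[len - 1 - i];
    //       d[i] = (x << 32) | (x >> 32);   // ROR by 32: swap 32-bit halves
    //     }
    //   }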
5380     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5381       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5382 
5383       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5384       mov(tmp1, len);
5385       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5386       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5387     }
5388     // where
5389     void reverse1(Register d, Register s, Register tmp) {
5390       ldr(tmp, pre(s, -wordSize));
5391       ror(tmp, tmp, 32);
5392       str(tmp, post(d, wordSize));
5393     }
5394 
5395     void step_squaring() {
5396       // An extra ACC
5397       step();
5398       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5399     }
5400 
5401     void last_squaring(RegisterOrConstant i) {
5402       Label dont;
5403       // if ((i & 1) == 0) {
5404       tbnz(i.as_register(), 0, dont); {
5405         // MACC(Ra, Rb, t0, t1, t2);
5406         // Ra = *++Pa;
5407         // Rb = *--Pb;
5408         umulh(Rhi_ab, Ra, Rb);
5409         mul(Rlo_ab, Ra, Rb);
5410         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5411       } bind(dont);
5412     }
5413 
5414     void extra_step_squaring() {
5415       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5416 
5417       // MACC(Rm, Rn, t0, t1, t2);
5418       // Rm = *++Pm;
5419       // Rn = *--Pn;
5420       umulh(Rhi_mn, Rm, Rn);
5421       mul(Rlo_mn, Rm, Rn);
5422       ldr(Rm, pre(Pm, wordSize));
5423       ldr(Rn, pre(Pn, -wordSize));
5424     }
5425 
5426     void post1_squaring() {
5427       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5428 
5429       // *Pm = Rm = t0 * inv;
5430       mul(Rm, t0, inv);
5431       str(Rm, Address(Pm));
5432 
5433       // MACC(Rm, Rn, t0, t1, t2);
5434       // t0 = t1; t1 = t2; t2 = 0;
5435       umulh(Rhi_mn, Rm, Rn);
5436 
5437 #ifndef PRODUCT
5438       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5439       {
5440         mul(Rlo_mn, Rm, Rn);
5441         add(Rlo_mn, t0, Rlo_mn);
5442         Label ok;
5443         cbz(Rlo_mn, ok); {
5444           stop("broken Montgomery multiply");
5445         } bind(ok);
5446       }
5447 #endif
5448       // We have very carefully set things up so that
5449       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5450       // the lower half of Rm * Rn because we know the result already:
5451       // it must be -t0.  t0 + (-t0) must generate a carry iff
5452       // t0 != 0.  So, rather than do a mul and an adds we just set
5453       // the carry flag iff t0 is nonzero.
5454       //
5455       // mul(Rlo_mn, Rm, Rn);
5456       // adds(zr, t0, Rlo_mn);
5457       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5458       adcs(t0, t1, Rhi_mn);
5459       adc(t1, t2, zr);
5460       mov(t2, zr);
5461     }
5462 
5463     void acc(Register Rhi, Register Rlo,
5464              Register t0, Register t1, Register t2) {
5465       adds(t0, t0, Rlo);
5466       adcs(t1, t1, Rhi);
5467       adc(t2, t2, zr);
5468     }
5469 
5470   public:
5471     /**
5472      * Fast Montgomery multiplication.  The derivation of the
5473      * algorithm is in A Cryptographic Library for the Motorola
5474      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5475      *
5476      * Arguments:
5477      *
5478      * Inputs for multiplication:
5479      *   c_rarg0   - int array elements a
5480      *   c_rarg1   - int array elements b
5481      *   c_rarg2   - int array elements n (the modulus)
5482      *   c_rarg3   - int length
5483      *   c_rarg4   - int inv
5484      *   c_rarg5   - int array elements m (the result)
5485      *
5486      * Inputs for squaring:
5487      *   c_rarg0   - int array elements a
5488      *   c_rarg1   - int array elements n (the modulus)
5489      *   c_rarg2   - int length
5490      *   c_rarg3   - int inv
5491      *   c_rarg4   - int array elements m (the result)
5492      *
5493      */
5494     address generate_multiply() {
5495       Label argh, nothing;
5496       bind(argh);
5497       stop("MontgomeryMultiply total_allocation must be <= 8192");
5498 
5499       align(CodeEntryAlignment);
5500       address entry = pc();
5501 
5502       cbzw(Rlen, nothing);
5503 
5504       enter();
5505 
5506       // Make room.
5507       cmpw(Rlen, 512);
5508       br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(julong Pa_base[], julong Pb_base[],
    //                     julong Pn_base[], julong Pm_base[],
    //                     julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
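    //
    // MACC and sub are not defined in this file; the following is a sketch
    // of the semantics assumed by the reference code above, not the exact
    // HotSpot helpers: MACC(A, B, t0, t1, t2) adds the 128-bit product A*B
    // into the triple-precision accumulator {t2,t1,t0}, and
    // sub(m, n, carry, len) subtracts the modulus n from m with borrow
    // propagation and returns the adjusted carry.  Approximately:
    //
    //   static void MACC(julong A, julong B,
    //                    julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 p = (unsigned __int128)A * B;
    //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;
    //     t0 = (julong)s;
    //     s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64);
    //     t1 = (julong)s;
    //     t2 += (julong)(s >> 64);
    //   }
    //
    //   static julong sub(julong m[], julong n[], julong carry, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned __int128 d = (unsigned __int128)m[i] - n[i] - borrow;
    //       m[i] = (julong)d;
    //       borrow = (julong)(d >> 64) & 1;   // 1 iff the subtraction borrowed
    //     }
    //     return carry - borrow;
    //   }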

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[],
    //                   julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
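    //
    // (MACC2 is likewise assumed rather than defined here: it accumulates
    //  the product twice, i.e. 2*A*B, into {t2,t1,t0}.  That is how the
    //  squaring code exploits the symmetry a[j]*a[i-j] == a[i-j]*a[j] to
    //  save roughly a quarter of the multiplications.)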
  };


  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // Referenced by megamorphic calls.
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
      // disabled pending fix and retest of generated code via JDK-8210858
      // StubRoutines::_dlog = generate_dlog();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      // disabled pending fix and retest of generated code via JDK-8210461
      // StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      // disabled pending fix and retest of generated code via JDK-8210461
      // StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has-negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array-equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
#if defined(LINUX) || defined(_ALLBSD_SOURCE)

    generate_atomic_entry_points();

#endif // LINUX || _ALLBSD_SOURCE

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}


#if defined(LINUX) || defined(_ALLBSD_SOURCE)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
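// For example, the first instantiation below, DEFAULT_ATOMIC_OP(fetch_add, 4, ),
// expands to (approximately):
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// so each *_impl function pointer starts out aimed at the out-of-line default
// from atomic_aarch64.S, and stub generation (generate_atomic_entry_points()
// above) can later repoint it at code emitted into the code cache.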

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX || _ALLBSD_SOURCE