1 /*
2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  *
24  */
25 
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "runtime/atomic.hpp"
44 #include "runtime/frame.inline.hpp"
45 #include "runtime/handles.inline.hpp"
46 #include "runtime/sharedRuntime.hpp"
47 #include "runtime/stubCodeGenerator.hpp"
48 #include "runtime/stubRoutines.hpp"
49 #include "runtime/thread.inline.hpp"
50 #include "utilities/align.hpp"
51 #include "utilities/powerOfTwo.hpp"
52 #ifdef COMPILER2
53 #include "opto/runtime.hpp"
54 #endif
55 #if INCLUDE_ZGC
56 #include "gc/z/zThreadLocalData.hpp"
57 #endif
58 
59 // Declaration and definition of StubGenerator (no .hpp file).
60 // For a more detailed description of the stub routine structure
61 // see the comment in stubRoutines.hpp
62 
63 #undef __
64 #define __ _masm->
65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
66 
67 #ifdef PRODUCT
68 #define BLOCK_COMMENT(str) /* nothing */
69 #else
70 #define BLOCK_COMMENT(str) __ block_comment(str)
71 #endif
72 
73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
74 
75 // Stub Code definitions
76 
77 class StubGenerator: public StubCodeGenerator {
78  private:
79 
80 #ifdef PRODUCT
81 #define inc_counter_np(counter) ((void)0)
82 #else
83   void inc_counter_np_(int& counter) {
84     __ lea(rscratch2, ExternalAddress((address)&counter));
85     __ ldrw(rscratch1, Address(rscratch2));
86     __ addw(rscratch1, rscratch1, 1);
87     __ strw(rscratch1, Address(rscratch2));
88   }
89 #define inc_counter_np(counter) \
90   BLOCK_COMMENT("inc_counter " #counter); \
91   inc_counter_np_(counter);
92 #endif
93 
94   // Call stubs are used to call Java from C
95   //
96   // Arguments:
97   //    c_rarg0:   call wrapper address                   address
98   //    c_rarg1:   result                                 address
99   //    c_rarg2:   result type                            BasicType
100   //    c_rarg3:   method                                 Method*
101   //    c_rarg4:   (interpreter) entry point              address
102   //    c_rarg5:   parameters                             intptr_t*
103   //    c_rarg6:   parameter size (in words)              int
104   //    c_rarg7:   thread                                 Thread*
105   //
106   // There is no return from the stub itself as any Java result
107   // is written to result
108   //
109   // we save r30 (lr) as the return PC at the base of the frame and
110   // link r29 (fp) below it as the frame pointer installing sp (r31)
111   // into fp.
112   //
113   // we save r0-r7, which accounts for all the c arguments.
114   //
115   // TODO: strictly do we need to save them all? they are treated as
116   // volatile by C so could we omit saving the ones we are going to
117   // place in global registers (thread? method?) or those we only use
118   // during setup of the Java call?
119   //
120   // we don't need to save r8 which C uses as an indirect result location
121   // return register.
122   //
123   // we don't need to save r9-r15 which both C and Java treat as
124   // volatile
125   //
126   // we don't need to save r16-18 because Java does not use them
127   //
128   // we save r19-r28 which Java uses as scratch registers and C
129   // expects to be callee-save
130   //
131   // we save the bottom 64 bits of each value stored in v8-v15; it is
132   // the responsibility of the caller to preserve larger values.
133   //
134   // so the stub frame looks like this when we enter Java code
135   //
136   //     [ return_from_Java     ] <--- sp
137   //     [ argument word n      ]
138   //      ...
139   // -27 [ argument word 1      ]
140   // -26 [ saved v15            ] <--- sp_after_call
141   // -25 [ saved v14            ]
142   // -24 [ saved v13            ]
143   // -23 [ saved v12            ]
144   // -22 [ saved v11            ]
145   // -21 [ saved v10            ]
146   // -20 [ saved v9             ]
147   // -19 [ saved v8             ]
148   // -18 [ saved r28            ]
149   // -17 [ saved r27            ]
150   // -16 [ saved r26            ]
151   // -15 [ saved r25            ]
152   // -14 [ saved r24            ]
153   // -13 [ saved r23            ]
154   // -12 [ saved r22            ]
155   // -11 [ saved r21            ]
156   // -10 [ saved r20            ]
157   //  -9 [ saved r19            ]
158   //  -8 [ call wrapper    (r0) ]
159   //  -7 [ result          (r1) ]
160   //  -6 [ result type     (r2) ]
161   //  -5 [ method          (r3) ]
162   //  -4 [ entry point     (r4) ]
163   //  -3 [ parameters      (r5) ]
164   //  -2 [ parameter size  (r6) ]
165   //  -1 [ thread (r7)          ]
166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
167   //   1 [ saved lr       (r30) ]
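  //
  // For orientation only, a hedged sketch (not code from this file) of how the
  // generated call stub is typically invoked from C++ through the CallStub
  // function pointer returned by StubRoutines::call_stub(); the names link,
  // result_val_address, entry_point, args and thread below are illustrative,
  // following the shape of JavaCalls::call_helper:
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,              // c_rarg0: call wrapper
  //     result_val_address,          // c_rarg1: where the Java result is stored
  //     result_type,                 // c_rarg2: BasicType of the result
  //     method(),                    // c_rarg3: Method* to invoke
  //     entry_point,                 // c_rarg4: interpreter entry point
  //     args->parameters(),          // c_rarg5: parameter words
  //     args->size_of_parameters(),  // c_rarg6: parameter count in words
  //     thread);                     // c_rarg7: current thread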
168 
169   // Call stub stack layout word offsets from fp
170   enum call_stub_layout {
171     sp_after_call_off = -26,
172 
173     d15_off            = -26,
174     d13_off            = -24,
175     d11_off            = -22,
176     d9_off             = -20,
177 
178     r28_off            = -18,
179     r26_off            = -16,
180     r24_off            = -14,
181     r22_off            = -12,
182     r20_off            = -10,
183     call_wrapper_off   =  -8,
184     result_off         =  -7,
185     result_type_off    =  -6,
186     method_off         =  -5,
187     entry_point_off    =  -4,
188     parameter_size_off =  -2,
189     thread_off         =  -1,
190     fp_f               =   0,
191     retaddr_off        =   1,
192   };
193 
194   address generate_call_stub(address& return_address) {
195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
197            "adjust this code");
198 
199     StubCodeMark mark(this, "StubRoutines", "call_stub");
200     address start = __ pc();
201 
202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
203 
204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
205     const Address result        (rfp, result_off         * wordSize);
206     const Address result_type   (rfp, result_type_off    * wordSize);
207     const Address method        (rfp, method_off         * wordSize);
208     const Address entry_point   (rfp, entry_point_off    * wordSize);
209     const Address parameter_size(rfp, parameter_size_off * wordSize);
210 
211     const Address thread        (rfp, thread_off         * wordSize);
212 
213     const Address d15_save      (rfp, d15_off * wordSize);
214     const Address d13_save      (rfp, d13_off * wordSize);
215     const Address d11_save      (rfp, d11_off * wordSize);
216     const Address d9_save       (rfp, d9_off * wordSize);
217 
218     const Address r28_save      (rfp, r28_off * wordSize);
219     const Address r26_save      (rfp, r26_off * wordSize);
220     const Address r24_save      (rfp, r24_off * wordSize);
221     const Address r22_save      (rfp, r22_off * wordSize);
222     const Address r20_save      (rfp, r20_off * wordSize);
223 
224     // stub code
225 
226     address aarch64_entry = __ pc();
227 
228     // set up frame and move sp to end of save area
229     __ enter();
230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
231 
232     // save register parameters and Java scratch/global registers
233     // n.b. we save thread even though it gets installed in
234     // rthread because we want to sanity check rthread later
235     __ str(c_rarg7,  thread);
236     __ strw(c_rarg6, parameter_size);
237     __ stp(c_rarg4, c_rarg5,  entry_point);
238     __ stp(c_rarg2, c_rarg3,  result_type);
239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
240 
241     __ stp(r20, r19,   r20_save);
242     __ stp(r22, r21,   r22_save);
243     __ stp(r24, r23,   r24_save);
244     __ stp(r26, r25,   r26_save);
245     __ stp(r28, r27,   r28_save);
246 
247     __ stpd(v9,  v8,   d9_save);
248     __ stpd(v11, v10,  d11_save);
249     __ stpd(v13, v12,  d13_save);
250     __ stpd(v15, v14,  d15_save);
251 
252     // install Java thread in global register now we have saved
253     // whatever value it held
254     __ mov(rthread, c_rarg7);
255     // And method
256     __ mov(rmethod, c_rarg3);
257 
258     // set up the heapbase register
259     __ reinit_heapbase();
260 
261 #ifdef ASSERT
262     // make sure we have no pending exceptions
263     {
264       Label L;
265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
266       __ cmp(rscratch1, (u1)NULL_WORD);
267       __ br(Assembler::EQ, L);
268       __ stop("StubRoutines::call_stub: entered with pending exception");
269       __ BIND(L);
270     }
271 #endif
272     // pass parameters if any
273     __ mov(esp, sp);
274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
275     __ andr(sp, rscratch1, -2 * wordSize);
276 
277     BLOCK_COMMENT("pass parameters if any");
278     Label parameters_done;
279     // parameter count is still in c_rarg6
280     // and parameter pointer identifying param 1 is in c_rarg5
281     __ cbzw(c_rarg6, parameters_done);
282 
283     address loop = __ pc();
284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
285     __ subsw(c_rarg6, c_rarg6, 1);
286     __ push(rscratch1);
287     __ br(Assembler::GT, loop);
288 
289     __ BIND(parameters_done);
290 
291     // call Java entry -- passing Method* and current sp
292     //      rmethod: Method*
293     //      r13: sender sp
294     BLOCK_COMMENT("call Java function");
295     __ mov(r13, sp);
296     __ blr(c_rarg4);
297 
298     // we do this here because the notify will already have been done
299     // if we get to the next instruction via an exception
300     //
301     // n.b. adding this instruction here affects the calculation of
302     // whether or not a routine returns to the call stub (used when
303     // doing stack walks) since the normal test is to check the return
304     // pc against the address saved below. so we may need to allow for
305     // this extra instruction in the check.
306 
307     // save current address for use by exception handling code
308 
309     return_address = __ pc();
310 
311     // store result depending on type (everything that is not
312     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
313     // n.b. this assumes Java returns an integral result in r0
314     // and a floating result in j_farg0
315     __ ldr(j_rarg2, result);
316     Label is_long, is_float, is_double, exit;
317     __ ldr(j_rarg1, result_type);
318     __ cmp(j_rarg1, (u1)T_OBJECT);
319     __ br(Assembler::EQ, is_long);
320     __ cmp(j_rarg1, (u1)T_LONG);
321     __ br(Assembler::EQ, is_long);
322     __ cmp(j_rarg1, (u1)T_FLOAT);
323     __ br(Assembler::EQ, is_float);
324     __ cmp(j_rarg1, (u1)T_DOUBLE);
325     __ br(Assembler::EQ, is_double);
326 
327     // handle T_INT case
328     __ strw(r0, Address(j_rarg2));
329 
330     __ BIND(exit);
331 
332     // pop parameters
333     __ sub(esp, rfp, -sp_after_call_off * wordSize);
334 
335 #ifdef ASSERT
336     // verify that threads correspond
337     {
338       Label L, S;
339       __ ldr(rscratch1, thread);
340       __ cmp(rthread, rscratch1);
341       __ br(Assembler::NE, S);
342       __ get_thread(rscratch1);
343       __ cmp(rthread, rscratch1);
344       __ br(Assembler::EQ, L);
345       __ BIND(S);
346       __ stop("StubRoutines::call_stub: threads must correspond");
347       __ BIND(L);
348     }
349 #endif
350 
351     // restore callee-save registers
352     __ ldpd(v15, v14,  d15_save);
353     __ ldpd(v13, v12,  d13_save);
354     __ ldpd(v11, v10,  d11_save);
355     __ ldpd(v9,  v8,   d9_save);
356 
357     __ ldp(r28, r27,   r28_save);
358     __ ldp(r26, r25,   r26_save);
359     __ ldp(r24, r23,   r24_save);
360     __ ldp(r22, r21,   r22_save);
361     __ ldp(r20, r19,   r20_save);
362 
363     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
364     __ ldrw(c_rarg2, result_type);
365     __ ldr(c_rarg3,  method);
366     __ ldp(c_rarg4, c_rarg5,  entry_point);
367     __ ldp(c_rarg6, c_rarg7,  parameter_size);
368 
369     // leave frame and return to caller
370     __ leave();
371     __ ret(lr);
372 
373     // handle return types different from T_INT
374 
375     __ BIND(is_long);
376     __ str(r0, Address(j_rarg2, 0));
377     __ br(Assembler::AL, exit);
378 
379     __ BIND(is_float);
380     __ strs(j_farg0, Address(j_rarg2, 0));
381     __ br(Assembler::AL, exit);
382 
383     __ BIND(is_double);
384     __ strd(j_farg0, Address(j_rarg2, 0));
385     __ br(Assembler::AL, exit);
386 
387     return start;
388   }
389 
390   // Return point for a Java call if there's an exception thrown in
391   // Java code.  The exception is caught and transformed into a
392   // pending exception stored in JavaThread that can be tested from
393   // within the VM.
394   //
395   // Note: Usually the parameters are removed by the callee. In case
396   // of an exception crossing an activation frame boundary, that is
397   // not the case if the callee is compiled code => need to setup the
398   // rsp.
399   //
400   // r0: exception oop
401 
402   address generate_catch_exception() {
403     StubCodeMark mark(this, "StubRoutines", "catch_exception");
404     address start = __ pc();
405 
406     // same as in generate_call_stub():
407     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
408     const Address thread        (rfp, thread_off         * wordSize);
409 
410 #ifdef ASSERT
411     // verify that threads correspond
412     {
413       Label L, S;
414       __ ldr(rscratch1, thread);
415       __ cmp(rthread, rscratch1);
416       __ br(Assembler::NE, S);
417       __ get_thread(rscratch1);
418       __ cmp(rthread, rscratch1);
419       __ br(Assembler::EQ, L);
420       __ bind(S);
421       __ stop("StubRoutines::catch_exception: threads must correspond");
422       __ bind(L);
423     }
424 #endif
425 
426     // set pending exception
427     __ verify_oop(r0);
428 
429     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
430     __ mov(rscratch1, (address)__FILE__);
431     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
432     __ movw(rscratch1, (int)__LINE__);
433     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
434 
435     // complete return to VM
436     assert(StubRoutines::_call_stub_return_address != NULL,
437            "_call_stub_return_address must have been generated before");
438     __ b(StubRoutines::_call_stub_return_address);
439 
440     return start;
441   }
442 
443   // Continuation point for runtime calls returning with a pending
444   // exception.  The pending exception check happened in the runtime
445   // or native call stub.  The pending exception in Thread is
446   // converted into a Java-level exception.
447   //
448   // Contract with Java-level exception handlers:
449   // r0: exception
450   // r3: throwing pc
451   //
452   // NOTE: At entry of this stub, exception-pc must be in LR !!
453 
454   // NOTE: this is always used as a jump target within generated code
455   // so it just needs to be generated code with no prolog
456 
457   address generate_forward_exception() {
458     StubCodeMark mark(this, "StubRoutines", "forward exception");
459     address start = __ pc();
460 
461     // Upon entry, LR points to the return address returning into
462     // Java (interpreted or compiled) code; i.e., the return address
463     // becomes the throwing pc.
464     //
465     // Arguments pushed before the runtime call are still on the stack
466     // but the exception handler will reset the stack pointer ->
467     // ignore them.  A potential result in registers can be ignored as
468     // well.
469 
470 #ifdef ASSERT
471     // make sure this code is only executed if there is a pending exception
472     {
473       Label L;
474       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
475       __ cbnz(rscratch1, L);
476       __ stop("StubRoutines::forward exception: no pending exception (1)");
477       __ bind(L);
478     }
479 #endif
480 
481     // compute exception handler into r19
482 
483     // call the VM to find the handler address associated with the
484     // caller address. pass thread in r0 and caller pc (ret address)
485     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
486     // the stack.
487     __ mov(c_rarg1, lr);
488     // lr will be trashed by the VM call so we move it to R19
489     // (callee-saved) because we also need to pass it to the handler
490     // returned by this call.
491     __ mov(r19, lr);
492     BLOCK_COMMENT("call exception_handler_for_return_address");
493     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
494                          SharedRuntime::exception_handler_for_return_address),
495                     rthread, c_rarg1);
496     // Reinitialize the ptrue predicate register, in case the external runtime
497     // call clobbers ptrue reg, as we may return to SVE compiled code.
498     __ reinitialize_ptrue();
499 
500     // we should not really care that lr is no longer the callee
501     // address. we saved the value the handler needs in r19 so we can
502     // just copy it to r3. however, the C2 handler will push its own
503     // frame and then call into the VM, and the VM code asserts that
504     // the PC for the frame above the handler belongs to a compiled
505     // Java method. So, we restore lr here to satisfy that assert.
506     __ mov(lr, r19);
507     // setup r0 & r3 & clear pending exception
508     __ mov(r3, r19);
509     __ mov(r19, r0);
510     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
511     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
512 
513 #ifdef ASSERT
514     // make sure exception is set
515     {
516       Label L;
517       __ cbnz(r0, L);
518       __ stop("StubRoutines::forward exception: no pending exception (2)");
519       __ bind(L);
520     }
521 #endif
522 
523     // continue at exception handler
524     // r0: exception
525     // r3: throwing pc
526     // r19: exception handler
527     __ verify_oop(r0);
528     __ br(r19);
529 
530     return start;
531   }
532 
533   // Non-destructive plausibility checks for oops
534   //
535   // Arguments:
536   //    r0: oop to verify
537   //    rscratch1: error message
538   //
539   // Stack after saving c_rarg3:
540   //    [tos + 0]: saved c_rarg3
541   //    [tos + 1]: saved c_rarg2
542   //    [tos + 2]: saved lr
543   //    [tos + 3]: saved rscratch2
544   //    [tos + 4]: saved r0
545   //    [tos + 5]: saved rscratch1
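  //
  // For orientation only, a hedged sketch of how MacroAssembler::verify_oop
  // typically reaches this stub (register save/restore elided; see
  // macroAssembler_aarch64.cpp for the authoritative sequence):
  //
  //   // oop to check goes in r0, address of the error message in rscratch1,
  //   // then branch-and-link to the generated subroutine:
  //   mov(r0, oop_reg);
  //   movptr(rscratch1, (uintptr_t)error_message);
  //   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  //   ldr(rscratch2, Address(rscratch2));
  //   blr(rscratch2);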
546   address generate_verify_oop() {
547 
548     StubCodeMark mark(this, "StubRoutines", "verify_oop");
549     address start = __ pc();
550 
551     Label exit, error;
552 
553     // save c_rarg2 and c_rarg3
554     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
555 
556     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
557     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
558     __ ldr(c_rarg3, Address(c_rarg2));
559     __ add(c_rarg3, c_rarg3, 1);
560     __ str(c_rarg3, Address(c_rarg2));
561 
562     // object is in r0
563     // make sure object is 'reasonable'
564     __ cbz(r0, exit); // if obj is NULL it is OK
565 
566 #if INCLUDE_ZGC
567     if (UseZGC) {
568       // Check if mask is good.
569       // verifies that ZAddressBadMask & r0 == 0
570       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
571       __ andr(c_rarg2, r0, c_rarg3);
572       __ cbnz(c_rarg2, error);
573     }
574 #endif
575 
576     // Check if the oop is in the right area of memory
577     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
578     __ andr(c_rarg2, r0, c_rarg3);
579     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
580 
581     // Compare c_rarg2 and c_rarg3.  We don't use a compare
582     // instruction here because the flags register is live.
583     __ eor(c_rarg2, c_rarg2, c_rarg3);
584     __ cbnz(c_rarg2, error);
585 
586     // make sure klass is 'reasonable', i.e. non-zero.
587     __ load_klass(r0, r0);  // get klass
588     __ cbz(r0, error);      // if klass is NULL it is broken
589 
590     // return if everything seems ok
591     __ bind(exit);
592 
593     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
594     __ ret(lr);
595 
596     // handle errors
597     __ bind(error);
598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
599 
600     __ push(RegSet::range(r0, r29), sp);
601     // debug(char* msg, int64_t pc, int64_t regs[])
602     __ mov(c_rarg0, rscratch1);      // pass address of error message
603     __ mov(c_rarg1, lr);             // pass return address
604     __ mov(c_rarg2, sp);             // pass address of regs on stack
605 #ifndef PRODUCT
606     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
607 #endif
608     BLOCK_COMMENT("call MacroAssembler::debug");
609     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
610     __ blr(rscratch1);
611     __ hlt(0);
612 
613     return start;
614   }
615 
616   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
617 
618   // Generate indices for iota vector.
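  // The two 64-bit literals emitted below are stored little-endian, so in
  // memory they form the byte sequence 0x00, 0x01, ..., 0x0F; i.e. lane i of
  // the 16-byte iota vector holds the value i.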
619   address generate_iota_indices(const char *stub_name) {
620     __ align(CodeEntryAlignment);
621     StubCodeMark mark(this, "StubRoutines", stub_name);
622     address start = __ pc();
623     __ emit_data64(0x0706050403020100, relocInfo::none);
624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
625     return start;
626   }
627 
628   // The inner part of zero_words().  This is the bulk operation,
629   // zeroing words in blocks, possibly using DC ZVA to do it.  The
630   // caller is responsible for zeroing the last few words.
631   //
632   // Inputs:
633   // r10: the HeapWord-aligned base address of an array to zero.
634   // r11: the count in HeapWords, r11 > 0.
635   //
636   // Returns r10 and r11, adjusted for the caller to clear.
637   // r10: the base address of the tail of words left to clear.
638   // r11: the number of words in the tail.
639   //      r11 < MacroAssembler::zero_words_block_size.
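  //
  // A hedged C-like model of the contract (illustrative only, not the exact
  // code emitted below):
  //
  //   while (r11 >= MacroAssembler::zero_words_block_size) {
  //     // zero one block of words at r10, using DC ZVA when it is enabled
  //     // and the span is large enough to be worthwhile
  //     r10 += block_words; r11 -= block_words;
  //   }
  //   // return with r10/r11 describing the short tail for the caller to clear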
640 
641   address generate_zero_blocks() {
642     Label done;
643     Label base_aligned;
644 
645     Register base = r10, cnt = r11;
646 
647     __ align(CodeEntryAlignment);
648     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
649     address start = __ pc();
650 
651     if (UseBlockZeroing) {
652       int zva_length = VM_Version::zva_length();
653 
654       // Ensure ZVA length can be divided by 16. This is required by
655       // the subsequent operations.
656       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
657 
658       __ tbz(base, 3, base_aligned);
659       __ str(zr, Address(__ post(base, 8)));
660       __ sub(cnt, cnt, 1);
661       __ bind(base_aligned);
662 
663       // Ensure count >= zva_length * 2 so that it still deserves a zva after
664       // alignment.
665       Label small;
666       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
667       __ subs(rscratch1, cnt, low_limit >> 3);
668       __ br(Assembler::LT, small);
669       __ zero_dcache_blocks(base, cnt);
670       __ bind(small);
671     }
672 
673     {
674       // Number of stp instructions we'll unroll
675       const int unroll =
676         MacroAssembler::zero_words_block_size / 2;
677       // Clear the remaining blocks.
678       Label loop;
679       __ subs(cnt, cnt, unroll * 2);
680       __ br(Assembler::LT, done);
681       __ bind(loop);
682       for (int i = 0; i < unroll; i++)
683         __ stp(zr, zr, __ post(base, 16));
684       __ subs(cnt, cnt, unroll * 2);
685       __ br(Assembler::GE, loop);
686       __ bind(done);
687       __ add(cnt, cnt, unroll * 2);
688     }
689 
690     __ ret(lr);
691 
692     return start;
693   }
694 
695 
696   typedef enum {
697     copy_forwards = 1,
698     copy_backwards = -1
699   } copy_direction;
700 
701   // Bulk copy of blocks of 8 words.
702   //
703   // count is a count of words.
704   //
705   // Precondition: count >= 8
706   //
707   // Postconditions:
708   //
709   // The least significant bit of count contains the remaining count
710   // of words to copy.  The rest of count is trash.
711   //
712   // s and d are adjusted to point to the remaining words to copy
713   //
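  // A hedged sketch of the contract (unit = +/- wordSize by direction):
  //
  //   while (count >= 8) { copy 8 words from s to d; advance s and d; count -= 8; }
  //   optionally copy a 4-word and then a 2-word sub-block,
  //   leaving only bit 0 of count (0 or 1 words) for the caller.
  //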
714   void generate_copy_longs(Label &start, Register s, Register d, Register count,
715                            copy_direction direction) {
716     int unit = wordSize * direction;
717     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
718 
719     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
720       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
721     const Register stride = r13;
722 
723     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
724     assert_different_registers(s, d, count, rscratch1);
725 
726     Label again, drain;
727     const char *stub_name;
728     if (direction == copy_forwards)
729       stub_name = "forward_copy_longs";
730     else
731       stub_name = "backward_copy_longs";
732 
733     __ align(CodeEntryAlignment);
734 
735     StubCodeMark mark(this, "StubRoutines", stub_name);
736 
737     __ bind(start);
738 
739     Label unaligned_copy_long;
740     if (AvoidUnalignedAccesses) {
741       __ tbnz(d, 3, unaligned_copy_long);
742     }
743 
744     if (direction == copy_forwards) {
745       __ sub(s, s, bias);
746       __ sub(d, d, bias);
747     }
748 
749 #ifdef ASSERT
750     // Make sure we are never given < 8 words
751     {
752       Label L;
753       __ cmp(count, (u1)8);
754       __ br(Assembler::GE, L);
755       __ stop("generate_copy_longs called with < 8 words");
756       __ bind(L);
757     }
758 #endif
759 
760     // Fill 8 registers
761     if (UseSIMDForMemoryOps) {
762       __ ldpq(v0, v1, Address(s, 4 * unit));
763       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
764     } else {
765       __ ldp(t0, t1, Address(s, 2 * unit));
766       __ ldp(t2, t3, Address(s, 4 * unit));
767       __ ldp(t4, t5, Address(s, 6 * unit));
768       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
769     }
770 
771     __ subs(count, count, 16);
772     __ br(Assembler::LO, drain);
773 
774     int prefetch = PrefetchCopyIntervalInBytes;
775     bool use_stride = false;
776     if (direction == copy_backwards) {
777        use_stride = prefetch > 256;
778        prefetch = -prefetch;
779        if (use_stride) __ mov(stride, prefetch);
780     }
781 
782     __ bind(again);
783 
784     if (PrefetchCopyIntervalInBytes > 0)
785       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
786 
787     if (UseSIMDForMemoryOps) {
788       __ stpq(v0, v1, Address(d, 4 * unit));
789       __ ldpq(v0, v1, Address(s, 4 * unit));
790       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
791       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
792     } else {
793       __ stp(t0, t1, Address(d, 2 * unit));
794       __ ldp(t0, t1, Address(s, 2 * unit));
795       __ stp(t2, t3, Address(d, 4 * unit));
796       __ ldp(t2, t3, Address(s, 4 * unit));
797       __ stp(t4, t5, Address(d, 6 * unit));
798       __ ldp(t4, t5, Address(s, 6 * unit));
799       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
800       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
801     }
802 
803     __ subs(count, count, 8);
804     __ br(Assembler::HS, again);
805 
806     // Drain
807     __ bind(drain);
808     if (UseSIMDForMemoryOps) {
809       __ stpq(v0, v1, Address(d, 4 * unit));
810       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
811     } else {
812       __ stp(t0, t1, Address(d, 2 * unit));
813       __ stp(t2, t3, Address(d, 4 * unit));
814       __ stp(t4, t5, Address(d, 6 * unit));
815       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
816     }
817 
818     {
819       Label L1, L2;
820       __ tbz(count, exact_log2(4), L1);
821       if (UseSIMDForMemoryOps) {
822         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
823         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
824       } else {
825         __ ldp(t0, t1, Address(s, 2 * unit));
826         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
827         __ stp(t0, t1, Address(d, 2 * unit));
828         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
829       }
830       __ bind(L1);
831 
832       if (direction == copy_forwards) {
833         __ add(s, s, bias);
834         __ add(d, d, bias);
835       }
836 
837       __ tbz(count, 1, L2);
838       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
839       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
840       __ bind(L2);
841     }
842 
843     __ ret(lr);
844 
845     if (AvoidUnalignedAccesses) {
846       Label drain, again;
847       // Register order for storing. Order is different for backward copy.
848 
849       __ bind(unaligned_copy_long);
850 
851       // source address is even aligned, target odd aligned
852       //
853       // when forward copying word pairs we read long pairs at offsets
854       // {0, 2, 4, 6} (in long words). when backwards copying we read
855       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
856       // address by -2 in the forwards case so we can compute the
857       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
858       // or -1.
859       //
860       // when forward copying we need to store 1 word, 3 pairs and
861       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
862       // zero offset we adjust the destination by -1, which means we
863       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
864       //
865       // When backwards copying we need to store 1 word, 3 pairs and
866       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
867       // offsets {1, 3, 5, 7, 8} * unit.
868 
869       if (direction == copy_forwards) {
870         __ sub(s, s, 16);
871         __ sub(d, d, 8);
872       }
873 
874       // Fill 8 registers
875       //
876       // for forwards copy s was offset by -16 from the original input
877       // value of s so the register contents are at these offsets
878       // relative to the 64 bit block addressed by that original input
879       // and so on for each successive 64 byte block when s is updated
880       //
881       // t0 at offset 0,  t1 at offset 8
882       // t2 at offset 16, t3 at offset 24
883       // t4 at offset 32, t5 at offset 40
884       // t6 at offset 48, t7 at offset 56
885 
886       // for backwards copy s was not offset so the register contents
887       // are at these offsets into the preceding 64 byte block
888       // relative to that original input and so on for each successive
889       // preceding 64 byte block when s is updated. this explains the
890       // slightly counter-intuitive looking pattern of register usage
891       // in the stp instructions for backwards copy.
892       //
893       // t0 at offset -16, t1 at offset -8
894       // t2 at offset -32, t3 at offset -24
895       // t4 at offset -48, t5 at offset -40
896       // t6 at offset -64, t7 at offset -56
897 
898       __ ldp(t0, t1, Address(s, 2 * unit));
899       __ ldp(t2, t3, Address(s, 4 * unit));
900       __ ldp(t4, t5, Address(s, 6 * unit));
901       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
902 
903       __ subs(count, count, 16);
904       __ br(Assembler::LO, drain);
905 
906       int prefetch = PrefetchCopyIntervalInBytes;
907       bool use_stride = false;
908       if (direction == copy_backwards) {
909          use_stride = prefetch > 256;
910          prefetch = -prefetch;
911          if (use_stride) __ mov(stride, prefetch);
912       }
913 
914       __ bind(again);
915 
916       if (PrefetchCopyIntervalInBytes > 0)
917         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
918 
919       if (direction == copy_forwards) {
920        // allowing for the offset of -8 the store instructions place
921        // registers into the target 64 bit block at the following
922        // offsets
923        //
924        // t0 at offset 0
925        // t1 at offset 8,  t2 at offset 16
926        // t3 at offset 24, t4 at offset 32
927        // t5 at offset 40, t6 at offset 48
928        // t7 at offset 56
929 
930         __ str(t0, Address(d, 1 * unit));
931         __ stp(t1, t2, Address(d, 2 * unit));
932         __ ldp(t0, t1, Address(s, 2 * unit));
933         __ stp(t3, t4, Address(d, 4 * unit));
934         __ ldp(t2, t3, Address(s, 4 * unit));
935         __ stp(t5, t6, Address(d, 6 * unit));
936         __ ldp(t4, t5, Address(s, 6 * unit));
937         __ str(t7, Address(__ pre(d, 8 * unit)));
938         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
939       } else {
940        // d was not offset when we started so the registers are
941        // written into the 64 bit block preceding d with the following
942        // offsets
943        //
944        // t1 at offset -8
945        // t3 at offset -24, t0 at offset -16
946        // t5 at offset -40, t2 at offset -32
947        // t7 at offset -56, t4 at offset -48
948        //                   t6 at offset -64
949        //
950        // note that this matches the offsets previously noted for the
951        // loads
952 
953         __ str(t1, Address(d, 1 * unit));
954         __ stp(t3, t0, Address(d, 3 * unit));
955         __ ldp(t0, t1, Address(s, 2 * unit));
956         __ stp(t5, t2, Address(d, 5 * unit));
957         __ ldp(t2, t3, Address(s, 4 * unit));
958         __ stp(t7, t4, Address(d, 7 * unit));
959         __ ldp(t4, t5, Address(s, 6 * unit));
960         __ str(t6, Address(__ pre(d, 8 * unit)));
961         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
962       }
963 
964       __ subs(count, count, 8);
965       __ br(Assembler::HS, again);
966 
967       // Drain
968       //
969       // this uses the same pattern of offsets and register arguments
970       // as above
971       __ bind(drain);
972       if (direction == copy_forwards) {
973         __ str(t0, Address(d, 1 * unit));
974         __ stp(t1, t2, Address(d, 2 * unit));
975         __ stp(t3, t4, Address(d, 4 * unit));
976         __ stp(t5, t6, Address(d, 6 * unit));
977         __ str(t7, Address(__ pre(d, 8 * unit)));
978       } else {
979         __ str(t1, Address(d, 1 * unit));
980         __ stp(t3, t0, Address(d, 3 * unit));
981         __ stp(t5, t2, Address(d, 5 * unit));
982         __ stp(t7, t4, Address(d, 7 * unit));
983         __ str(t6, Address(__ pre(d, 8 * unit)));
984       }
985       // now we need to copy any remaining part block which may
986       // include a 4 word subblock and/or a 2 word subblock.
987       // bits 2 and 1 in the count are the tell-tale for whether we
988       // have each such subblock
989       {
990         Label L1, L2;
991         __ tbz(count, exact_log2(4), L1);
992        // this is the same as above but copying only 4 longs hence
993        // with only one intervening stp between the str instructions
994        // but note that the offsets and registers still follow the
995        // same pattern
996         __ ldp(t0, t1, Address(s, 2 * unit));
997         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
998         if (direction == copy_forwards) {
999           __ str(t0, Address(d, 1 * unit));
1000           __ stp(t1, t2, Address(d, 2 * unit));
1001           __ str(t3, Address(__ pre(d, 4 * unit)));
1002         } else {
1003           __ str(t1, Address(d, 1 * unit));
1004           __ stp(t3, t0, Address(d, 3 * unit));
1005           __ str(t2, Address(__ pre(d, 4 * unit)));
1006         }
1007         __ bind(L1);
1008 
1009         __ tbz(count, 1, L2);
1010        // this is the same as above but copying only 2 longs hence
1011        // there is no intervening stp between the str instructions
1012        // but note that the offset and register patterns are still
1013        // the same
1014         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1015         if (direction == copy_forwards) {
1016           __ str(t0, Address(d, 1 * unit));
1017           __ str(t1, Address(__ pre(d, 2 * unit)));
1018         } else {
1019           __ str(t1, Address(d, 1 * unit));
1020           __ str(t0, Address(__ pre(d, 2 * unit)));
1021         }
1022         __ bind(L2);
1023 
1024        // for forwards copy we need to re-adjust the offsets we
1025        // applied so that s and d follow the last words written
1026 
1027        if (direction == copy_forwards) {
1028          __ add(s, s, 16);
1029          __ add(d, d, 8);
1030        }
1031 
1032       }
1033 
1034       __ ret(lr);
1035       }
1036   }
1037 
1038   // Small copy: less than 16 bytes.
1039   //
1040   // NB: Ignores all of the bits of count which represent more than 15
1041   // bytes, so a caller doesn't have to mask them.
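  //
  // A hedged sketch of the decomposition for a byte copy (step == +/-1);
  // wider granularities simply skip the narrower tests:
  //
  //   if (count & 8) copy 8 bytes;   // tbz on bit 3
  //   if (count & 4) copy 4 bytes;   // tbz on bit 2
  //   if (count & 2) copy 2 bytes;   // tbz on bit 1
  //   if (count & 1) copy 1 byte;    // tbz on bit 0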
1042 
1043   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1044     bool is_backwards = step < 0;
1045     size_t granularity = uabs(step);
1046     int direction = is_backwards ? -1 : 1;
1047     int unit = wordSize * direction;
1048 
1049     Label Lword, Lint, Lshort, Lbyte;
1050 
1051     assert(granularity
1052            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1053 
1054     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1055 
1056     // ??? I don't know if this bit-test-and-branch is the right thing
1057     // to do.  It does a lot of jumping, resulting in several
1058     // mispredicted branches.  It might make more sense to do this
1059     // with something like Duff's device with a single computed branch.
1060 
1061     __ tbz(count, 3 - exact_log2(granularity), Lword);
1062     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1063     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1064     __ bind(Lword);
1065 
1066     if (granularity <= sizeof (jint)) {
1067       __ tbz(count, 2 - exact_log2(granularity), Lint);
1068       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1069       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1070       __ bind(Lint);
1071     }
1072 
1073     if (granularity <= sizeof (jshort)) {
1074       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1075       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1076       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1077       __ bind(Lshort);
1078     }
1079 
1080     if (granularity <= sizeof (jbyte)) {
1081       __ tbz(count, 0, Lbyte);
1082       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1083       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1084       __ bind(Lbyte);
1085     }
1086   }
1087 
1088   Label copy_f, copy_b;
1089 
1090   // All-singing all-dancing memory copy.
1091   //
1092   // Copy count units of memory from s to d.  The size of a unit is
1093   // step, which can be positive or negative depending on the direction
1094   // of copy.  If is_aligned is false, we align the source address.
1095   //
1096 
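  // A hedged outline of the generated code shape (illustrative only):
  //
  //   if (total bytes <= 80, or 96 with SIMD)
  //     copy inline with overlapping loads/stores from both ends of the range;
  //   else
  //     align s to a 2-word boundary (copying the unaligned prefix),
  //     bulk-copy 8-word blocks via copy_f / copy_b,
  //     then finish the tail with copy_memory_small.
  //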
1097   void copy_memory(bool is_aligned, Register s, Register d,
1098                    Register count, Register tmp, int step) {
1099     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1100     bool is_backwards = step < 0;
1101     unsigned int granularity = uabs(step);
1102     const Register t0 = r3, t1 = r4;
1103 
1104     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't
1105     // matter because we always load all the data before writing anything.
1106     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1107     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1108     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1109     const Register send = r17, dend = r16;
1110 
1111     if (PrefetchCopyIntervalInBytes > 0)
1112       __ prfm(Address(s, 0), PLDL1KEEP);
1113     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1114     __ br(Assembler::HI, copy_big);
1115 
1116     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1117     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1118 
1119     __ cmp(count, u1(16/granularity));
1120     __ br(Assembler::LS, copy16);
1121 
1122     __ cmp(count, u1(64/granularity));
1123     __ br(Assembler::HI, copy80);
1124 
1125     __ cmp(count, u1(32/granularity));
1126     __ br(Assembler::LS, copy32);
1127 
1128     // 33..64 bytes
1129     if (UseSIMDForMemoryOps) {
1130       __ ldpq(v0, v1, Address(s, 0));
1131       __ ldpq(v2, v3, Address(send, -32));
1132       __ stpq(v0, v1, Address(d, 0));
1133       __ stpq(v2, v3, Address(dend, -32));
1134     } else {
1135       __ ldp(t0, t1, Address(s, 0));
1136       __ ldp(t2, t3, Address(s, 16));
1137       __ ldp(t4, t5, Address(send, -32));
1138       __ ldp(t6, t7, Address(send, -16));
1139 
1140       __ stp(t0, t1, Address(d, 0));
1141       __ stp(t2, t3, Address(d, 16));
1142       __ stp(t4, t5, Address(dend, -32));
1143       __ stp(t6, t7, Address(dend, -16));
1144     }
1145     __ b(finish);
1146 
1147     // 17..32 bytes
1148     __ bind(copy32);
1149     __ ldp(t0, t1, Address(s, 0));
1150     __ ldp(t2, t3, Address(send, -16));
1151     __ stp(t0, t1, Address(d, 0));
1152     __ stp(t2, t3, Address(dend, -16));
1153     __ b(finish);
1154 
1155     // 65..80/96 bytes
1156     // (96 bytes if SIMD because we do 32 bytes per instruction)
1157     __ bind(copy80);
1158     if (UseSIMDForMemoryOps) {
1159       __ ldpq(v0, v1, Address(s, 0));
1160       __ ldpq(v2, v3, Address(s, 32));
1161       // Unaligned pointers can be an issue for copying.
1162       // The issue is more likely when the granularity of the data is less
1163       // than 4 bytes (sizeof(jint)): pointers to arrays of jint are at least
1164       // 4 byte aligned and pointers to arrays of jlong are 8 byte aligned.
1165       // The biggest performance drop has been seen for the range 65-80 bytes.
1166       // For such cases using a pair of ldp/stp instead of the third pair of
1167       // ldpq/stpq fixes the performance issue.
1168       if (granularity < sizeof (jint)) {
1169         Label copy96;
1170         __ cmp(count, u1(80/granularity));
1171         __ br(Assembler::HI, copy96);
1172         __ ldp(t0, t1, Address(send, -16));
1173 
1174         __ stpq(v0, v1, Address(d, 0));
1175         __ stpq(v2, v3, Address(d, 32));
1176         __ stp(t0, t1, Address(dend, -16));
1177         __ b(finish);
1178 
1179         __ bind(copy96);
1180       }
1181       __ ldpq(v4, v5, Address(send, -32));
1182 
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(d, 32));
1185       __ stpq(v4, v5, Address(dend, -32));
1186     } else {
1187       __ ldp(t0, t1, Address(s, 0));
1188       __ ldp(t2, t3, Address(s, 16));
1189       __ ldp(t4, t5, Address(s, 32));
1190       __ ldp(t6, t7, Address(s, 48));
1191       __ ldp(t8, t9, Address(send, -16));
1192 
1193       __ stp(t0, t1, Address(d, 0));
1194       __ stp(t2, t3, Address(d, 16));
1195       __ stp(t4, t5, Address(d, 32));
1196       __ stp(t6, t7, Address(d, 48));
1197       __ stp(t8, t9, Address(dend, -16));
1198     }
1199     __ b(finish);
1200 
1201     // 0..16 bytes
1202     __ bind(copy16);
1203     __ cmp(count, u1(8/granularity));
1204     __ br(Assembler::LO, copy8);
1205 
1206     // 8..16 bytes
1207     __ ldr(t0, Address(s, 0));
1208     __ ldr(t1, Address(send, -8));
1209     __ str(t0, Address(d, 0));
1210     __ str(t1, Address(dend, -8));
1211     __ b(finish);
1212 
1213     if (granularity < 8) {
1214       // 4..7 bytes
1215       __ bind(copy8);
1216       __ tbz(count, 2 - exact_log2(granularity), copy4);
1217       __ ldrw(t0, Address(s, 0));
1218       __ ldrw(t1, Address(send, -4));
1219       __ strw(t0, Address(d, 0));
1220       __ strw(t1, Address(dend, -4));
1221       __ b(finish);
1222       if (granularity < 4) {
1223         // 0..3 bytes
1224         __ bind(copy4);
1225         __ cbz(count, finish); // get rid of 0 case
1226         if (granularity == 2) {
1227           __ ldrh(t0, Address(s, 0));
1228           __ strh(t0, Address(d, 0));
1229         } else { // granularity == 1
1230           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1231           // the first and last byte.
1232           // Handle the 3 byte case by loading and storing base + count/2
1233           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1234           // This does mean that in the 1 byte case we load/store the same
1235           // byte 3 times.
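          // Worked examples: count == 3 copies s[0]->d[0], s[2]->d[2] and,
          // since count>>1 == 1, s[1]->d[1]; count == 2 writes d[0] once and
          // d[1] twice; count == 1 writes d[0] three times.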
1236           __ lsr(count, count, 1);
1237           __ ldrb(t0, Address(s, 0));
1238           __ ldrb(t1, Address(send, -1));
1239           __ ldrb(t2, Address(s, count));
1240           __ strb(t0, Address(d, 0));
1241           __ strb(t1, Address(dend, -1));
1242           __ strb(t2, Address(d, count));
1243         }
1244         __ b(finish);
1245       }
1246     }
1247 
1248     __ bind(copy_big);
1249     if (is_backwards) {
1250       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1251       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1252     }
1253 
1254     // Now that we've got the small case out of the way we can align the
1255     // source address on a 2-word boundary.
1256 
1257     Label aligned;
1258 
1259     if (is_aligned) {
1260       // We may have to adjust by 1 word to get s 2-word-aligned.
1261       __ tbz(s, exact_log2(wordSize), aligned);
1262       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1263       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1264       __ sub(count, count, wordSize/granularity);
1265     } else {
1266       if (is_backwards) {
1267         __ andr(rscratch2, s, 2 * wordSize - 1);
1268       } else {
1269         __ neg(rscratch2, s);
1270         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1271       }
1272       // rscratch2 is the byte adjustment needed to align s.
1273       __ cbz(rscratch2, aligned);
1274       int shift = exact_log2(granularity);
1275       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1276       __ sub(count, count, rscratch2);
1277 
1278 #if 0
1279       // ?? This code is only correct for a disjoint copy.  It may or
1280       // may not make sense to use it in that case.
1281 
1282       // Copy the first pair; s and d may not be aligned.
1283       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1284       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1285 
1286       // Align s and d, adjust count
1287       if (is_backwards) {
1288         __ sub(s, s, rscratch2);
1289         __ sub(d, d, rscratch2);
1290       } else {
1291         __ add(s, s, rscratch2);
1292         __ add(d, d, rscratch2);
1293       }
1294 #else
1295       copy_memory_small(s, d, rscratch2, rscratch1, step);
1296 #endif
1297     }
1298 
1299     __ bind(aligned);
1300 
1301     // s is now 2-word-aligned.
1302 
1303     // We have a count of units and some trailing bytes.  Adjust the
1304     // count and do a bulk copy of words.
1305     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1306     if (direction == copy_forwards)
1307       __ bl(copy_f);
1308     else
1309       __ bl(copy_b);
1310 
1311     // And the tail.
1312     copy_memory_small(s, d, count, tmp, step);
1313 
1314     if (granularity >= 8) __ bind(copy8);
1315     if (granularity >= 4) __ bind(copy4);
1316     __ bind(finish);
1317   }
1318 
1319 
1320   void clobber_registers() {
1321 #ifdef ASSERT
1322     RegSet clobbered
1323       = MacroAssembler::call_clobbered_registers() - rscratch1;
1324     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1325     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1326     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1327       __ mov(*it, rscratch1);
1328     }
1329 #endif
1330 
1331   }
1332 
1333   // Scan over array at a for count oops, verifying each one.
1334   // Preserves a and count, clobbers rscratch1 and rscratch2.
1335   void verify_oop_array (int size, Register a, Register count, Register temp) {
1336     Label loop, end;
1337     __ mov(rscratch1, a);
1338     __ mov(rscratch2, zr);
1339     __ bind(loop);
1340     __ cmp(rscratch2, count);
1341     __ br(Assembler::HS, end);
1342     if (size == wordSize) {
1343       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1344       __ verify_oop(temp);
1345     } else {
1346       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1347       __ decode_heap_oop(temp); // calls verify_oop
1348     }
1349     __ add(rscratch2, rscratch2, 1);
1350     __ b(loop);
1351     __ bind(end);
1352   }
1353 
1354   // Arguments:
1355   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1356   //             ignored
1357   //   is_oop  - true => oop array, so generate store check code
1358   //   name    - stub name string
1359   //
1360   // Inputs:
1361   //   c_rarg0   - source array address
1362   //   c_rarg1   - destination array address
1363   //   c_rarg2   - element count, treated as ssize_t, can be zero
1364   //
1365   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1366   // the hardware handle it.  The two dwords within qwords that span
1367   // cache line boundaries will still be loaded and stored atomically.
1368   //
1369   // Side Effects:
1370   //   disjoint_int_copy_entry is set to the no-overlap entry point
1371   //   used by generate_conjoint_int_oop_copy().
1372   //
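  // For orientation only (hedged): the typed entry points later in this file
  // wrap this generator, e.g. generate_disjoint_byte_copy() simply calls
  //
  //   generate_disjoint_copy(sizeof (jbyte), aligned, /*is_oop*/false, entry, name);
  //
  // and the resulting stubs are installed into the StubRoutines arraycopy
  // slots by generate_arraycopy_stubs().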
1373   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1374                                   const char *name, bool dest_uninitialized = false) {
1375     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1376     RegSet saved_reg = RegSet::of(s, d, count);
1377     __ align(CodeEntryAlignment);
1378     StubCodeMark mark(this, "StubRoutines", name);
1379     address start = __ pc();
1380     __ enter();
1381 
1382     if (entry != NULL) {
1383       *entry = __ pc();
1384       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1385       BLOCK_COMMENT("Entry:");
1386     }
1387 
1388     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1389     if (dest_uninitialized) {
1390       decorators |= IS_DEST_UNINITIALIZED;
1391     }
1392     if (aligned) {
1393       decorators |= ARRAYCOPY_ALIGNED;
1394     }
1395 
1396     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1397     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1398 
1399     if (is_oop) {
1400       // save regs before copy_memory
1401       __ push(RegSet::of(d, count), sp);
1402     }
1403     {
1404       // UnsafeCopyMemory page error: continue after ucm
1405       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1406       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1407       copy_memory(aligned, s, d, count, rscratch1, size);
1408     }
1409 
1410     if (is_oop) {
1411       __ pop(RegSet::of(d, count), sp);
1412       if (VerifyOops)
1413         verify_oop_array(size, d, count, r16);
1414     }
1415 
1416     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1417 
1418     __ leave();
1419     __ mov(r0, zr); // return 0
1420     __ ret(lr);
1421     return start;
1422   }
1423 
1424   // Arguments:
1425   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1426   //             ignored
1427   //   is_oop  - true => oop array, so generate store check code
1428   //   name    - stub name string
1429   //
1430   // Inputs:
1431   //   c_rarg0   - source array address
1432   //   c_rarg1   - destination array address
1433   //   c_rarg2   - element count, treated as ssize_t, can be zero
1434   //
1435   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1436   // the hardware handle it.  The two dwords within qwords that span
1437   // cache line boundaries will still be loaded and stored atomically.
1438   //
1439   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1440                                  address *entry, const char *name,
1441                                  bool dest_uninitialized = false) {
1442     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1443     RegSet saved_regs = RegSet::of(s, d, count);
1444     StubCodeMark mark(this, "StubRoutines", name);
1445     address start = __ pc();
1446     __ enter();
1447 
1448     if (entry != NULL) {
1449       *entry = __ pc();
1450       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1451       BLOCK_COMMENT("Entry:");
1452     }
1453 
1454     // use fwd copy when (d-s) above_equal (count*size)
1455     __ sub(rscratch1, d, s);
1456     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1457     __ br(Assembler::HS, nooverlap_target);
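    // The single unsigned comparison covers both safe cases: if d < s the
    // subtraction wraps to a large unsigned value, and if d >= s + count*size
    // the regions are disjoint; either way the forward copy is taken.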
1458 
1459     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1460     if (dest_uninitialized) {
1461       decorators |= IS_DEST_UNINITIALIZED;
1462     }
1463     if (aligned) {
1464       decorators |= ARRAYCOPY_ALIGNED;
1465     }
1466 
1467     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1468     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1469 
1470     if (is_oop) {
1471       // save regs before copy_memory
1472       __ push(RegSet::of(d, count), sp);
1473     }
1474     {
1475       // UnsafeCopyMemory page error: continue after ucm
1476       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1477       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1478       copy_memory(aligned, s, d, count, rscratch1, -size);
1479     }
1480     if (is_oop) {
1481       __ pop(RegSet::of(d, count), sp);
1482       if (VerifyOops)
1483         verify_oop_array(size, d, count, r16);
1484     }
1485     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1486     __ leave();
1487     __ mov(r0, zr); // return 0
1488     __ ret(lr);
1489     return start;
1490   }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1514   // Side Effects:
1515   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_byte_copy().
1517   //
1518   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1534   // we let the hardware handle it.  The one to eight bytes within words,
1535   // dwords or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1539                                       address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_short_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_short_copy().
1562   //
1563   address generate_disjoint_short_copy(bool aligned,
1564                                        address* entry, const char *name) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1580   // let the hardware handle it.  The two or four words within dwords
1581   // or qwords that span cache line boundaries will still be loaded
1582   // and stored atomically.
1583   //
1584   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1585                                        address *entry, const char *name) {
1586     const bool not_oop = false;
1587     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1588   }
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1601   // the hardware handle it.  The two dwords within qwords that span
1602   // cache line boundaries will still be loaded and stored atomically.
1603   //
1604   // Side Effects:
1605   //   disjoint_int_copy_entry is set to the no-overlap entry point
1606   //   used by generate_conjoint_int_oop_copy().
1607   //
1608   address generate_disjoint_int_copy(bool aligned, address *entry,
1609                                          const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as ssize_t, can be zero
1623   //
1624   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1625   // the hardware handle it.  The two dwords within qwords that span
1626   // cache line boundaries will still be loaded and stored atomically.
1627   //
1628   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1629                                      address *entry, const char *name,
1630                                      bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1633   }
1634 
1635 
1636   // Arguments:
1637   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638   //             ignored
1639   //   name    - stub name string
1640   //
1641   // Inputs:
1642   //   c_rarg0   - source array address
1643   //   c_rarg1   - destination array address
1644   //   c_rarg2   - element count, treated as size_t, can be zero
1645   //
1646   // Side Effects:
1647   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1648   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1649   //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651                                           const char *name, bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654   }
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   address generate_conjoint_long_copy(bool aligned,
1667                                       address nooverlap_target, address *entry,
1668                                       const char *name, bool dest_uninitialized = false) {
1669     const bool not_oop = false;
1670     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671   }
1672 
1673   // Arguments:
1674   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675   //             ignored
1676   //   name    - stub name string
1677   //
1678   // Inputs:
1679   //   c_rarg0   - source array address
1680   //   c_rarg1   - destination array address
1681   //   c_rarg2   - element count, treated as size_t, can be zero
1682   //
1683   // Side Effects:
1684   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1685   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1686   //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688                                      const char *name, bool dest_uninitialized) {
1689     const bool is_oop = true;
1690     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   address generate_conjoint_oop_copy(bool aligned,
1705                                      address nooverlap_target, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710                                   name, dest_uninitialized);
1711   }
1712 
1713 
1714   // Helper for generating a dynamic type check.
1715   // Smashes rscratch1, rscratch2.
1716   void generate_type_check(Register sub_klass,
1717                            Register super_check_offset,
1718                            Register super_klass,
1719                            Label& L_success) {
1720     assert_different_registers(sub_klass, super_check_offset, super_klass);
1721 
1722     BLOCK_COMMENT("type_check:");
1723 
1724     Label L_miss;
1725 
1726     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1727                                      super_check_offset);
1728     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
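    // (The fast path covers the common cases - exact klass match and supertypes
    //  reachable via super_check_offset - while the slow path scans the secondary
    //  supers array; either may branch straight to L_success.)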
1729 
1730     // Fall through on failure!
1731     __ BIND(L_miss);
1732   }
1733 
1734   //
1735   //  Generate checkcasting array copy stub
1736   //
1737   //  Input:
1738   //    c_rarg0   - source array address
1739   //    c_rarg1   - destination array address
1740   //    c_rarg2   - element count, treated as ssize_t, can be zero
1741   //    c_rarg3   - size_t ckoff (super_check_offset)
1742   //    c_rarg4   - oop ckval (super_klass)
1743   //
1744   //  Output:
1745   //    r0 ==  0  -  success
1746   //    r0 == -1^K - failure, where K is partial transfer count
1747   //
1748   address generate_checkcast_copy(const char *name, address *entry,
1749                                   bool dest_uninitialized = false) {
1750 
1751     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752 
1753     // Input registers (after setup_arg_regs)
1754     const Register from        = c_rarg0;   // source array address
1755     const Register to          = c_rarg1;   // destination array address
1756     const Register count       = c_rarg2;   // elements count
1757     const Register ckoff       = c_rarg3;   // super_check_offset
1758     const Register ckval       = c_rarg4;   // super_klass
1759 
1760     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761     RegSet wb_post_saved_regs = RegSet::of(count);
1762 
1763     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1764     const Register copied_oop  = r22;       // actual oop copied
1765     const Register count_save  = r21;       // original elements count
1766     const Register start_to    = r20;       // destination array start address
1767     const Register r19_klass   = r19;       // oop._klass
1768 
1769     //---------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the two arrays are subtypes of Object[] but the
1772     // destination array type is not equal to or a supertype
1773     // of the source type.  Each element must be separately
1774     // checked.
1775 
1776     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777                                copied_oop, r19_klass, count_save);
1778 
1779     __ align(CodeEntryAlignment);
1780     StubCodeMark mark(this, "StubRoutines", name);
1781     address start = __ pc();
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785 #ifdef ASSERT
1786     // caller guarantees that the arrays really are different
1787     // otherwise, we would have to make conjoint checks
1788     { Label L;
1789       array_overlap_test(L, TIMES_OOP);
1790       __ stop("checkcast_copy within a single array");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     // Caller of this entry point must set up the argument registers.
1796     if (entry != NULL) {
1797       *entry = __ pc();
1798       BLOCK_COMMENT("Entry:");
1799     }
1800 
1801      // Empty array:  Nothing to do.
1802     __ cbz(count, L_done);
1803     __ push(RegSet::of(r19, r20, r21, r22), sp);
1804 
1805 #ifdef ASSERT
1806     BLOCK_COMMENT("assert consistent ckoff/ckval");
1807     // The ckoff and ckval must be mutually consistent,
1808     // even though caller generates both.
1809     { Label L;
1810       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1811       __ ldrw(start_to, Address(ckval, sco_offset));
1812       __ cmpw(ckoff, start_to);
1813       __ br(Assembler::EQ, L);
1814       __ stop("super_check_offset inconsistent");
1815       __ bind(L);
1816     }
1817 #endif //ASSERT
1818 
1819     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1820     bool is_oop = true;
1821     if (dest_uninitialized) {
1822       decorators |= IS_DEST_UNINITIALIZED;
1823     }
1824 
1825     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1826     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1827 
1828     // save the original count
1829     __ mov(count_save, count);
1830 
1831     // Copy from low to high addresses
1832     __ mov(start_to, to);              // Save destination array start address
1833     __ b(L_load_element);
1834 
1835     // ======== begin loop ========
1836     // (Loop is rotated; its entry is L_load_element.)
1837     // Loop control:
1838     //   for (; count != 0; count--) {
1839     //     copied_oop = load_heap_oop(from++);
1840     //     ... generate_type_check ...;
1841     //     store_heap_oop(to++, copied_oop);
1842     //   }
1843     __ align(OptoLoopAlignment);
1844 
1845     __ BIND(L_store_element);
1846     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1847     __ sub(count, count, 1);
1848     __ cbz(count, L_do_card_marks);
1849 
1850     // ======== loop entry is here ========
1851     __ BIND(L_load_element);
1852     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1853     __ cbz(copied_oop, L_store_element);
1854 
1855     __ load_klass(r19_klass, copied_oop);// query the object klass
1856     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1857     // ======== end loop ========
1858 
1859     // It was a real error; we must depend on the caller to finish the job.
1860     // Register count = remaining oops, count_orig = total oops.
1861     // Emit GC store barriers for the oops we have copied and report
1862     // their number to the caller.
1863 
1864     __ subs(count, count_save, count);     // K = partially copied oop count
1865     __ eon(count, count, zr);                   // report (-1^K) to caller
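    // Example: if 3 of 10 oops were stored before the failing element, K == 3 and
    // the stub returns ~3 == -4 in r0; the caller recovers K as ~r0. If nothing
    // was copied (K == 0) the EQ branch below skips the card-marking epilogue.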
1866     __ br(Assembler::EQ, L_done_pop);
1867 
1868     __ BIND(L_do_card_marks);
1869     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1870 
1871     __ bind(L_done_pop);
1872     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1873     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1874 
1875     __ bind(L_done);
1876     __ mov(r0, count);
1877     __ leave();
1878     __ ret(lr);
1879 
1880     return start;
1881   }
1882 
1883   // Perform range checks on the proposed arraycopy.
1884   // Kills temp, but nothing else.
1885   // Also, clean the sign bits of src_pos and dst_pos.
1886   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1887                               Register src_pos, // source position (c_rarg1)
1888                               Register dst,     // destination array oop (c_rarg2)
1889                               Register dst_pos, // destination position (c_rarg3)
1890                               Register length,
1891                               Register temp,
1892                               Label& L_failed) {
1893     BLOCK_COMMENT("arraycopy_range_checks:");
1894 
1895     assert_different_registers(rscratch1, temp);
1896 
1897     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1898     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1899     __ addw(temp, length, src_pos);
1900     __ cmpw(temp, rscratch1);
1901     __ br(Assembler::HI, L_failed);
1902 
1903     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1904     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1905     __ addw(temp, length, dst_pos);
1906     __ cmpw(temp, rscratch1);
1907     __ br(Assembler::HI, L_failed);
1908 
1909     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1910     __ movw(src_pos, src_pos);
1911     __ movw(dst_pos, dst_pos);
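    // (A 32-bit mov to the same register zero-extends into the 64-bit register,
    //  clearing any stale upper bits before src_pos/dst_pos take part in 64-bit
    //  address arithmetic.)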
1912 
1913     BLOCK_COMMENT("arraycopy_range_checks done");
1914   }
1915 
1916   // These stubs get called from some dumb test routine.
1917   // I'll write them properly when they're called from
1918   // something that's actually doing something.
1919   static void fake_arraycopy_stub(address src, address dst, int count) {
1920     assert(count == 0, "huh?");
1921   }
1922 
1923 
1924   //
1925   //  Generate 'unsafe' array copy stub
1926   //  Though just as safe as the other stubs, it takes an unscaled
1927   //  size_t argument instead of an element count.
1928   //
1929   //  Input:
1930   //    c_rarg0   - source array address
1931   //    c_rarg1   - destination array address
1932   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1933   //
1934   // Examines the alignment of the operands and dispatches
1935   // to a long, int, short, or byte copy loop.
1936   //
1937   address generate_unsafe_copy(const char *name,
1938                                address byte_copy_entry,
1939                                address short_copy_entry,
1940                                address int_copy_entry,
1941                                address long_copy_entry) {
1942     Label L_long_aligned, L_int_aligned, L_short_aligned;
1943     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1944 
1945     __ align(CodeEntryAlignment);
1946     StubCodeMark mark(this, "StubRoutines", name);
1947     address start = __ pc();
1948     __ enter(); // required for proper stackwalking of RuntimeStub frame
1949 
1950     // bump this on entry, not on exit:
1951     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1952 
1953     __ orr(rscratch1, s, d);
1954     __ orr(rscratch1, rscratch1, count);
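    // rscratch1 now has a low bit set wherever s, d or count does, so its
    // low-order bits encode the strictest alignment common to all three; the
    // tests below pick the widest element width that alignment allows.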
1955 
1956     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1957     __ cbz(rscratch1, L_long_aligned);
1958     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1959     __ cbz(rscratch1, L_int_aligned);
1960     __ tbz(rscratch1, 0, L_short_aligned);
1961     __ b(RuntimeAddress(byte_copy_entry));
1962 
1963     __ BIND(L_short_aligned);
1964     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1965     __ b(RuntimeAddress(short_copy_entry));
1966     __ BIND(L_int_aligned);
1967     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1968     __ b(RuntimeAddress(int_copy_entry));
1969     __ BIND(L_long_aligned);
1970     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1971     __ b(RuntimeAddress(long_copy_entry));
1972 
1973     return start;
1974   }
1975 
1976   //
1977   //  Generate generic array copy stubs
1978   //
1979   //  Input:
1980   //    c_rarg0    -  src oop
1981   //    c_rarg1    -  src_pos (32-bits)
1982   //    c_rarg2    -  dst oop
1983   //    c_rarg3    -  dst_pos (32-bits)
1984   //    c_rarg4    -  element count (32-bits)
1985   //
1986   //  Output:
1987   //    r0 ==  0  -  success
1988   //    r0 == -1^K - failure, where K is partial transfer count
1989   //
1990   address generate_generic_copy(const char *name,
1991                                 address byte_copy_entry, address short_copy_entry,
1992                                 address int_copy_entry, address oop_copy_entry,
1993                                 address long_copy_entry, address checkcast_copy_entry) {
1994 
1995     Label L_failed, L_objArray;
1996     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1997 
1998     // Input registers
1999     const Register src        = c_rarg0;  // source array oop
2000     const Register src_pos    = c_rarg1;  // source position
2001     const Register dst        = c_rarg2;  // destination array oop
2002     const Register dst_pos    = c_rarg3;  // destination position
2003     const Register length     = c_rarg4;
2004 
2005 
2006     // Registers used as temps
2007     const Register dst_klass  = c_rarg5;
2008 
2009     __ align(CodeEntryAlignment);
2010 
2011     StubCodeMark mark(this, "StubRoutines", name);
2012 
2013     address start = __ pc();
2014 
2015     __ enter(); // required for proper stackwalking of RuntimeStub frame
2016 
2017     // bump this on entry, not on exit:
2018     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2019 
2020     //-----------------------------------------------------------------------
2021     // Assembler stub will be used for this call to arraycopy
2022     // if the following conditions are met:
2023     //
2024     // (1) src and dst must not be null.
2025     // (2) src_pos must not be negative.
2026     // (3) dst_pos must not be negative.
2027     // (4) length  must not be negative.
2028     // (5) src klass and dst klass should be the same and not NULL.
2029     // (6) src and dst should be arrays.
2030     // (7) src_pos + length must not exceed length of src.
2031     // (8) dst_pos + length must not exceed length of dst.
2032     //
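    // If any of these checks fails the stub returns -1 (L_failed below) and the
    // caller falls back to the slower generic runtime arraycopy path.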
2033 
2034     //  if (src == NULL) return -1;
2035     __ cbz(src, L_failed);
2036 
2037     //  if (src_pos < 0) return -1;
2038     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2039 
2040     //  if (dst == NULL) return -1;
2041     __ cbz(dst, L_failed);
2042 
2043     //  if (dst_pos < 0) return -1;
2044     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2045 
2046     // registers used as temp
2047     const Register scratch_length    = r16; // elements count to copy
2048     const Register scratch_src_klass = r17; // array klass
2049     const Register lh                = r15; // layout helper
2050 
2051     //  if (length < 0) return -1;
2052     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2053     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2054 
2055     __ load_klass(scratch_src_klass, src);
2056 #ifdef ASSERT
2057     //  assert(src->klass() != NULL);
2058     {
2059       BLOCK_COMMENT("assert klasses not null {");
2060       Label L1, L2;
2061       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2062       __ bind(L1);
2063       __ stop("broken null klass");
2064       __ bind(L2);
2065       __ load_klass(rscratch1, dst);
2066       __ cbz(rscratch1, L1);     // this would be broken also
2067       BLOCK_COMMENT("} assert klasses not null done");
2068     }
2069 #endif
2070 
2071     // Load layout helper (32-bits)
2072     //
2073     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2074     // 32        30    24            16              8     2                 0
2075     //
2076     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2077     //
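    // For reference, the fields can be decoded in C++ roughly as:
    //   int tag        = lh >> Klass::_lh_array_tag_shift;       // 0x3 typeArray, 0x2 objArray
    //   int hsize      = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_esize = lh & Klass::_lh_log2_element_size_mask; // shift is 0, as asserted below
    // The ubfx further down extracts the header-size field, and the low bits of
    // lh serve directly as log2_esize for the element-size dispatch.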
2078 
2079     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2080 
2081     // Handle objArrays completely differently...
2082     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2083     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2084     __ movw(rscratch1, objArray_lh);
2085     __ eorw(rscratch2, lh, rscratch1);
2086     __ cbzw(rscratch2, L_objArray);
2087 
2088     //  if (src->klass() != dst->klass()) return -1;
2089     __ load_klass(rscratch2, dst);
2090     __ eor(rscratch2, rscratch2, scratch_src_klass);
2091     __ cbnz(rscratch2, L_failed);
2092 
2093     //  if (!src->is_Array()) return -1;
2094     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2095 
2096     // At this point, it is known to be a typeArray (array_tag 0x3).
2097 #ifdef ASSERT
2098     {
2099       BLOCK_COMMENT("assert primitive array {");
2100       Label L;
2101       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2102       __ cmpw(lh, rscratch2);
2103       __ br(Assembler::GE, L);
2104       __ stop("must be a primitive array");
2105       __ bind(L);
2106       BLOCK_COMMENT("} assert primitive array done");
2107     }
2108 #endif
2109 
2110     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2111                            rscratch2, L_failed);
2112 
2113     // TypeArrayKlass
2114     //
2115     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2116     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2117     //
2118 
2119     const Register rscratch1_offset = rscratch1;    // array offset
2120     const Register r15_elsize = lh; // element size
2121 
2122     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2123            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2124     __ add(src, src, rscratch1_offset);           // src array offset
2125     __ add(dst, dst, rscratch1_offset);           // dst array offset
2126     BLOCK_COMMENT("choose copy loop based on element size");
2127 
2128     // The following registers must be set up before jumping to the corresponding copy stub
2129     const Register from     = c_rarg0;  // source array address
2130     const Register to       = c_rarg1;  // destination array address
2131     const Register count    = c_rarg2;  // elements count
2132 
2133     // 'from', 'to', 'count' must be set in exactly this order: they alias 'src',
2134     // 'src_pos', 'dst', so each move clobbers an input only after it is no longer needed.
2135 
2136     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2137 
2138     // The possible values of elsize are 0-3, i.e. exact_log2(element
2139     // size in bytes).  We do a simple bitwise binary search.
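    // (Bit 1 of elsize first splits {byte, short} from {int, long}; bit 0 then
    //  picks within the pair - two tbnz tests instead of three compares.)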
2140   __ BIND(L_copy_bytes);
2141     __ tbnz(r15_elsize, 1, L_copy_ints);
2142     __ tbnz(r15_elsize, 0, L_copy_shorts);
2143     __ lea(from, Address(src, src_pos));// src_addr
2144     __ lea(to,   Address(dst, dst_pos));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(byte_copy_entry));
2147 
2148   __ BIND(L_copy_shorts);
2149     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2150     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2151     __ movw(count, scratch_length); // length
2152     __ b(RuntimeAddress(short_copy_entry));
2153 
2154   __ BIND(L_copy_ints);
2155     __ tbnz(r15_elsize, 0, L_copy_longs);
2156     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(int_copy_entry));
2160 
2161   __ BIND(L_copy_longs);
2162 #ifdef ASSERT
2163     {
2164       BLOCK_COMMENT("assert long copy {");
2165       Label L;
2166       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2167       __ cmpw(r15_elsize, LogBytesPerLong);
2168       __ br(Assembler::EQ, L);
2169       __ stop("must be long copy, but elsize is wrong");
2170       __ bind(L);
2171       BLOCK_COMMENT("} assert long copy done");
2172     }
2173 #endif
2174     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2175     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2176     __ movw(count, scratch_length); // length
2177     __ b(RuntimeAddress(long_copy_entry));
2178 
2179     // ObjArrayKlass
2180   __ BIND(L_objArray);
2181     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2182 
2183     Label L_plain_copy, L_checkcast_copy;
2184     //  test array classes for subtyping
2185     __ load_klass(r15, dst);
2186     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2187     __ br(Assembler::NE, L_checkcast_copy);
2188 
2189     // Identically typed arrays can be copied without element-wise checks.
2190     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191                            rscratch2, L_failed);
2192 
2193     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197     __ movw(count, scratch_length); // length
2198   __ BIND(L_plain_copy);
2199     __ b(RuntimeAddress(oop_copy_entry));
2200 
2201   __ BIND(L_checkcast_copy);
2202     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2203     {
2204       // Before looking at dst.length, make sure dst is also an objArray.
2205       __ ldrw(rscratch1, Address(r15, lh_offset));
2206       __ movw(rscratch2, objArray_lh);
2207       __ eorw(rscratch1, rscratch1, rscratch2);
2208       __ cbnzw(rscratch1, L_failed);
2209 
2210       // It is safe to examine both src.length and dst.length.
2211       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212                              r15, L_failed);
2213 
2214       __ load_klass(dst_klass, dst); // reload
2215 
2216       // Marshal the base address arguments now, freeing registers.
2217       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2218       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2220       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221       __ movw(count, length);           // length (reloaded)
2222       Register sco_temp = c_rarg3;      // this register is free now
2223       assert_different_registers(from, to, count, sco_temp,
2224                                  dst_klass, scratch_src_klass);
2225       // assert_clean_int(count, sco_temp);
2226 
2227       // Generate the type check.
2228       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2229       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2230 
2231       // Smashes rscratch1, rscratch2
2232       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2233 
2234       // Fetch destination element klass from the ObjArrayKlass header.
2235       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2236       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2237       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2238 
2239       // the checkcast_copy loop needs two extra arguments:
2240       assert(c_rarg3 == sco_temp, "#3 already in place");
2241       // Set up arguments for checkcast_copy_entry.
2242       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2243       __ b(RuntimeAddress(checkcast_copy_entry));
2244     }
2245 
2246   __ BIND(L_failed);
2247     __ mov(r0, -1);
2248     __ leave();   // required for proper stackwalking of RuntimeStub frame
2249     __ ret(lr);
2250 
2251     return start;
2252   }
2253 
2254   //
2255   // Generate stub for array fill. If "aligned" is true, the
2256   // "to" address is assumed to be heapword aligned.
2257   //
2258   // Arguments for generated stub:
2259   //   to:    c_rarg0
2260   //   value: c_rarg1
2261   //   count: c_rarg2 treated as signed
2262   //
2263   address generate_fill(BasicType t, bool aligned, const char *name) {
2264     __ align(CodeEntryAlignment);
2265     StubCodeMark mark(this, "StubRoutines", name);
2266     address start = __ pc();
2267 
2268     BLOCK_COMMENT("Entry:");
2269 
2270     const Register to        = c_rarg0;  // source array address
2271     const Register value     = c_rarg1;  // value
2272     const Register count     = c_rarg2;  // elements count
2273 
2274     const Register bz_base = r10;        // base for block_zero routine
2275     const Register cnt_words = r11;      // temp register
2276 
2277     __ enter();
2278 
2279     Label L_fill_elements, L_exit1;
2280 
2281     int shift = -1;
2282     switch (t) {
2283       case T_BYTE:
2284         shift = 0;
2285         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2286         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2287         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2288         __ br(Assembler::LO, L_fill_elements);
2289         break;
2290       case T_SHORT:
2291         shift = 1;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2294         __ br(Assembler::LO, L_fill_elements);
2295         break;
2296       case T_INT:
2297         shift = 2;
2298         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2299         __ br(Assembler::LO, L_fill_elements);
2300         break;
2301       default: ShouldNotReachHere();
2302     }
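    // At this point 'value' holds the fill pattern replicated to 32 bits: for
    // T_BYTE, bfi(value, value, 8, 8) turns 0x000000AB into 0x0000ABAB and
    // bfi(value, value, 16, 16) into 0xABABABAB. The bfi(value, value, 32, 32)
    // further down widens it to 64 bits so each 8-byte store writes a whole
    // run of elements.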
2303 
2304     // Align source address at 8 bytes address boundary.
2305     Label L_skip_align1, L_skip_align2, L_skip_align4;
2306     if (!aligned) {
2307       switch (t) {
2308         case T_BYTE:
2309           // One byte misalignment happens only for byte arrays.
2310           __ tbz(to, 0, L_skip_align1);
2311           __ strb(value, Address(__ post(to, 1)));
2312           __ subw(count, count, 1);
2313           __ bind(L_skip_align1);
2314           // Fallthrough
2315         case T_SHORT:
2316           // Two bytes misalignment happens only for byte and short (char) arrays.
2317           __ tbz(to, 1, L_skip_align2);
2318           __ strh(value, Address(__ post(to, 2)));
2319           __ subw(count, count, 2 >> shift);
2320           __ bind(L_skip_align2);
2321           // Fallthrough
2322         case T_INT:
2323           // Align to 8 bytes, we know we are 4 byte aligned to start.
2324           __ tbz(to, 2, L_skip_align4);
2325           __ strw(value, Address(__ post(to, 4)));
2326           __ subw(count, count, 4 >> shift);
2327           __ bind(L_skip_align4);
2328           break;
2329         default: ShouldNotReachHere();
2330       }
2331     }
2332 
2333     //
2334     //  Fill large chunks
2335     //
2336     __ lsrw(cnt_words, count, 3 - shift); // number of words
2337     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2338     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2339     if (UseBlockZeroing) {
2340       Label non_block_zeroing, rest;
2341       // If the fill value is zero we can use the fast zero_words().
2342       __ cbnz(value, non_block_zeroing);
2343       __ mov(bz_base, to);
2344       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2345       __ zero_words(bz_base, cnt_words);
2346       __ b(rest);
2347       __ bind(non_block_zeroing);
2348       __ fill_words(to, cnt_words, value);
2349       __ bind(rest);
2350     } else {
2351       __ fill_words(to, cnt_words, value);
2352     }
2353 
2354     // Remaining count is less than 8 bytes. Fill it by a single store.
2355     // Note that the total length is no less than 8 bytes.
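    // That lower bound matters: the 8-byte store below ends exactly at the end
    // of the fill region, so it can only re-write elements already filled above
    // and never touches memory before the array.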
2356     if (t == T_BYTE || t == T_SHORT) {
2357       Label L_exit1;
2358       __ cbzw(count, L_exit1);
2359       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2360       __ str(value, Address(to, -8));    // overwrite some elements
2361       __ bind(L_exit1);
2362       __ leave();
2363       __ ret(lr);
2364     }
2365 
2366     // Handle copies less than 8 bytes.
2367     Label L_fill_2, L_fill_4, L_exit2;
2368     __ bind(L_fill_elements);
2369     switch (t) {
2370       case T_BYTE:
2371         __ tbz(count, 0, L_fill_2);
2372         __ strb(value, Address(__ post(to, 1)));
2373         __ bind(L_fill_2);
2374         __ tbz(count, 1, L_fill_4);
2375         __ strh(value, Address(__ post(to, 2)));
2376         __ bind(L_fill_4);
2377         __ tbz(count, 2, L_exit2);
2378         __ strw(value, Address(to));
2379         break;
2380       case T_SHORT:
2381         __ tbz(count, 0, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 1, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_INT:
2388         __ cbzw(count, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       default: ShouldNotReachHere();
2392     }
2393     __ bind(L_exit2);
2394     __ leave();
2395     __ ret(lr);
2396     return start;
2397   }
2398 
2399   address generate_data_cache_writeback() {
2400     const Register line        = c_rarg0;  // address of line to write back
2401 
2402     __ align(CodeEntryAlignment);
2403 
2404     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2405 
2406     address start = __ pc();
2407     __ enter();
2408     __ cache_wb(Address(line, 0));
2409     __ leave();
2410     __ ret(lr);
2411 
2412     return start;
2413   }
2414 
2415   address generate_data_cache_writeback_sync() {
2416     const Register is_pre     = c_rarg0;  // pre or post sync
2417 
2418     __ align(CodeEntryAlignment);
2419 
2420     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2421 
2422     // pre wbsync is a no-op
2423     // post wbsync emits a memory barrier (the AArch64 counterpart of x86's sfence)
2424 
2425     Label skip;
2426     address start = __ pc();
2427     __ enter();
2428     __ cbnz(is_pre, skip);
2429     __ cache_wbsync(false);
2430     __ bind(skip);
2431     __ leave();
2432     __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   void generate_arraycopy_stubs() {
2438     address entry;
2439     address entry_jbyte_arraycopy;
2440     address entry_jshort_arraycopy;
2441     address entry_jint_arraycopy;
2442     address entry_oop_arraycopy;
2443     address entry_jlong_arraycopy;
2444     address entry_checkcast_arraycopy;
2445 
2446     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2447     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2448 
2449     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2450 
2451     //*** jbyte
2452     // Always need aligned and unaligned versions
2453     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2454                                                                                   "jbyte_disjoint_arraycopy");
2455     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2456                                                                                   &entry_jbyte_arraycopy,
2457                                                                                   "jbyte_arraycopy");
2458     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2459                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2460     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2461                                                                                   "arrayof_jbyte_arraycopy");
2462 
2463     //*** jshort
2464     // Always need aligned and unaligned versions
2465     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2466                                                                                     "jshort_disjoint_arraycopy");
2467     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2468                                                                                     &entry_jshort_arraycopy,
2469                                                                                     "jshort_arraycopy");
2470     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2471                                                                                     "arrayof_jshort_disjoint_arraycopy");
2472     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2473                                                                                     "arrayof_jshort_arraycopy");
2474 
2475     //*** jint
2476     // Aligned versions
2477     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2478                                                                                 "arrayof_jint_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2480                                                                                 "arrayof_jint_arraycopy");
2481     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2482     // entry_jint_arraycopy always points to the unaligned version
2483     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2484                                                                                 "jint_disjoint_arraycopy");
2485     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2486                                                                                 &entry_jint_arraycopy,
2487                                                                                 "jint_arraycopy");
2488 
2489     //*** jlong
2490     // It is always aligned
2491     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2492                                                                                   "arrayof_jlong_disjoint_arraycopy");
2493     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2494                                                                                   "arrayof_jlong_arraycopy");
2495     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2496     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2497 
2498     //*** oops
2499     {
2500       // With compressed oops we need unaligned versions; notice that
2501       // we overwrite entry_oop_arraycopy.
2502       bool aligned = !UseCompressedOops;
2503 
2504       StubRoutines::_arrayof_oop_disjoint_arraycopy
2505         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2506                                      /*dest_uninitialized*/false);
2507       StubRoutines::_arrayof_oop_arraycopy
2508         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2509                                      /*dest_uninitialized*/false);
2510       // Aligned versions without pre-barriers
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2513                                      /*dest_uninitialized*/true);
2514       StubRoutines::_arrayof_oop_arraycopy_uninit
2515         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2516                                      /*dest_uninitialized*/true);
2517     }
2518 
2519     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2520     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2521     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2522     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2523 
2524     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2525     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2526                                                                         /*dest_uninitialized*/true);
2527 
2528     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2529                                                               entry_jbyte_arraycopy,
2530                                                               entry_jshort_arraycopy,
2531                                                               entry_jint_arraycopy,
2532                                                               entry_jlong_arraycopy);
2533 
2534     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2535                                                                entry_jbyte_arraycopy,
2536                                                                entry_jshort_arraycopy,
2537                                                                entry_jint_arraycopy,
2538                                                                entry_oop_arraycopy,
2539                                                                entry_jlong_arraycopy,
2540                                                                entry_checkcast_arraycopy);
2541 
2542     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2543     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2544     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2545     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2546     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2547     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2548   }
2549 
2550   void generate_math_stubs() { Unimplemented(); }
2551 
2552   // Arguments:
2553   //
2554   // Inputs:
2555   //   c_rarg0   - source byte array address
2556   //   c_rarg1   - destination byte array address
2557   //   c_rarg2   - K (key) in little endian int array
2558   //
2559   address generate_aescrypt_encryptBlock() {
2560     __ align(CodeEntryAlignment);
2561     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2562 
2563     Label L_doLast;
2564 
2565     const Register from        = c_rarg0;  // source array address
2566     const Register to          = c_rarg1;  // destination array address
2567     const Register key         = c_rarg2;  // key array address
2568     const Register keylen      = rscratch1;
2569 
2570     address start = __ pc();
2571     __ enter();
2572 
2573     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
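    // keylen is the expanded key length in ints: 44, 52 or 60 for AES-128,
    // AES-192 and AES-256 respectively (4 * (rounds + 1)); the cmpw checks
    // below use it to skip the extra rounds for the shorter keys.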
2574 
2575     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2576 
2577     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2578     __ rev32(v1, __ T16B, v1);
2579     __ rev32(v2, __ T16B, v2);
2580     __ rev32(v3, __ T16B, v3);
2581     __ rev32(v4, __ T16B, v4);
2582     __ aese(v0, v1);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v2);
2585     __ aesmc(v0, v0);
2586     __ aese(v0, v3);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v4);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594     __ rev32(v3, __ T16B, v3);
2595     __ rev32(v4, __ T16B, v4);
2596     __ aese(v0, v1);
2597     __ aesmc(v0, v0);
2598     __ aese(v0, v2);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v3);
2601     __ aesmc(v0, v0);
2602     __ aese(v0, v4);
2603     __ aesmc(v0, v0);
2604 
2605     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2606     __ rev32(v1, __ T16B, v1);
2607     __ rev32(v2, __ T16B, v2);
2608 
2609     __ cmpw(keylen, 44);
2610     __ br(Assembler::EQ, L_doLast);
2611 
2612     __ aese(v0, v1);
2613     __ aesmc(v0, v0);
2614     __ aese(v0, v2);
2615     __ aesmc(v0, v0);
2616 
2617     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2618     __ rev32(v1, __ T16B, v1);
2619     __ rev32(v2, __ T16B, v2);
2620 
2621     __ cmpw(keylen, 52);
2622     __ br(Assembler::EQ, L_doLast);
2623 
2624     __ aese(v0, v1);
2625     __ aesmc(v0, v0);
2626     __ aese(v0, v2);
2627     __ aesmc(v0, v0);
2628 
2629     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2630     __ rev32(v1, __ T16B, v1);
2631     __ rev32(v2, __ T16B, v2);
2632 
2633     __ BIND(L_doLast);
2634 
2635     __ aese(v0, v1);
2636     __ aesmc(v0, v0);
2637     __ aese(v0, v2);
2638 
2639     __ ld1(v1, __ T16B, key);
2640     __ rev32(v1, __ T16B, v1);
2641     __ eor(v0, __ T16B, v0, v1);
2642 
2643     __ st1(v0, __ T16B, to);
2644 
2645     __ mov(r0, 0);
2646 
2647     __ leave();
2648     __ ret(lr);
2649 
2650     return start;
2651   }
2652 
2653   // Arguments:
2654   //
2655   // Inputs:
2656   //   c_rarg0   - source byte array address
2657   //   c_rarg1   - destination byte array address
2658   //   c_rarg2   - K (key) in little endian int array
2659   //
2660   address generate_aescrypt_decryptBlock() {
2661     assert(UseAES, "need AES cryptographic extension support");
2662     __ align(CodeEntryAlignment);
2663     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2664     Label L_doLast;
2665 
2666     const Register from        = c_rarg0;  // source array address
2667     const Register to          = c_rarg1;  // destination array address
2668     const Register key         = c_rarg2;  // key array address
2669     const Register keylen      = rscratch1;
2670 
2671     address start = __ pc();
2672     __ enter(); // required for proper stackwalking of RuntimeStub frame
2673 
2674     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2675 
2676     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2677 
2678     __ ld1(v5, __ T16B, __ post(key, 16));
2679     __ rev32(v5, __ T16B, v5);
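    // v5 holds the first 16 bytes of the key schedule; in the inverse cipher it
    // is applied last, as the final AddRoundKey (the eor just before the st1
    // near the end of this stub).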
2680 
2681     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2682     __ rev32(v1, __ T16B, v1);
2683     __ rev32(v2, __ T16B, v2);
2684     __ rev32(v3, __ T16B, v3);
2685     __ rev32(v4, __ T16B, v4);
2686     __ aesd(v0, v1);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v2);
2689     __ aesimc(v0, v0);
2690     __ aesd(v0, v3);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v4);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698     __ rev32(v3, __ T16B, v3);
2699     __ rev32(v4, __ T16B, v4);
2700     __ aesd(v0, v1);
2701     __ aesimc(v0, v0);
2702     __ aesd(v0, v2);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v3);
2705     __ aesimc(v0, v0);
2706     __ aesd(v0, v4);
2707     __ aesimc(v0, v0);
2708 
2709     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2710     __ rev32(v1, __ T16B, v1);
2711     __ rev32(v2, __ T16B, v2);
2712 
2713     __ cmpw(keylen, 44);
2714     __ br(Assembler::EQ, L_doLast);
2715 
2716     __ aesd(v0, v1);
2717     __ aesimc(v0, v0);
2718     __ aesd(v0, v2);
2719     __ aesimc(v0, v0);
2720 
2721     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2722     __ rev32(v1, __ T16B, v1);
2723     __ rev32(v2, __ T16B, v2);
2724 
2725     __ cmpw(keylen, 52);
2726     __ br(Assembler::EQ, L_doLast);
2727 
2728     __ aesd(v0, v1);
2729     __ aesimc(v0, v0);
2730     __ aesd(v0, v2);
2731     __ aesimc(v0, v0);
2732 
2733     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2734     __ rev32(v1, __ T16B, v1);
2735     __ rev32(v2, __ T16B, v2);
2736 
2737     __ BIND(L_doLast);
2738 
2739     __ aesd(v0, v1);
2740     __ aesimc(v0, v0);
2741     __ aesd(v0, v2);
2742 
2743     __ eor(v0, __ T16B, v0, v5);
2744 
2745     __ st1(v0, __ T16B, to);
2746 
2747     __ mov(r0, 0);
2748 
2749     __ leave();
2750     __ ret(lr);
2751 
2752     return start;
2753   }
2754 
2755   // Arguments:
2756   //
2757   // Inputs:
2758   //   c_rarg0   - source byte array address
2759   //   c_rarg1   - destination byte array address
2760   //   c_rarg2   - K (key) in little endian int array
2761   //   c_rarg3   - r vector byte array address
2762   //   c_rarg4   - input length
2763   //
2764   // Output:
2765   //   x0        - input length
2766   //
2767   address generate_cipherBlockChaining_encryptAESCrypt() {
2768     assert(UseAES, "need AES cryptographic extension support");
2769     __ align(CodeEntryAlignment);
2770     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2771 
2772     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2773 
2774     const Register from        = c_rarg0;  // source array address
2775     const Register to          = c_rarg1;  // destination array address
2776     const Register key         = c_rarg2;  // key array address
2777     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector address
2778                                            // and left holding the last ciphertext block on exit
2779     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2780     const Register keylen      = rscratch1;
2781 
2782     address start = __ pc();
2783 
2784       __ enter();
2785 
2786       __ movw(rscratch2, len_reg);
2787 
2788       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2789 
2790       __ ld1(v0, __ T16B, rvec);
2791 
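      // keylen is 44/52/60 ints for AES-128/192/256. Falling through loads the
      // full 15-round-key schedule; the labels below skip the keys a shorter
      // schedule does not have.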
2792       __ cmpw(keylen, 52);
2793       __ br(Assembler::CC, L_loadkeys_44);
2794       __ br(Assembler::EQ, L_loadkeys_52);
2795 
2796       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2797       __ rev32(v17, __ T16B, v17);
2798       __ rev32(v18, __ T16B, v18);
2799     __ BIND(L_loadkeys_52);
2800       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2801       __ rev32(v19, __ T16B, v19);
2802       __ rev32(v20, __ T16B, v20);
2803     __ BIND(L_loadkeys_44);
2804       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2805       __ rev32(v21, __ T16B, v21);
2806       __ rev32(v22, __ T16B, v22);
2807       __ rev32(v23, __ T16B, v23);
2808       __ rev32(v24, __ T16B, v24);
2809       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2810       __ rev32(v25, __ T16B, v25);
2811       __ rev32(v26, __ T16B, v26);
2812       __ rev32(v27, __ T16B, v27);
2813       __ rev32(v28, __ T16B, v28);
2814       __ ld1(v29, v30, v31, __ T16B, key);
2815       __ rev32(v29, __ T16B, v29);
2816       __ rev32(v30, __ T16B, v30);
2817       __ rev32(v31, __ T16B, v31);
2818 
2819     __ BIND(L_aes_loop);
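      // CBC encryption: xor the next plaintext block into the chaining value
      // (v0 holds the IV on the first iteration), then run the AES rounds on it.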
2820       __ ld1(v1, __ T16B, __ post(from, 16));
2821       __ eor(v0, __ T16B, v0, v1);
2822 
2823       __ br(Assembler::CC, L_rounds_44);
2824       __ br(Assembler::EQ, L_rounds_52);
2825 
2826       __ aese(v0, v17); __ aesmc(v0, v0);
2827       __ aese(v0, v18); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_52);
2829       __ aese(v0, v19); __ aesmc(v0, v0);
2830       __ aese(v0, v20); __ aesmc(v0, v0);
2831     __ BIND(L_rounds_44);
2832       __ aese(v0, v21); __ aesmc(v0, v0);
2833       __ aese(v0, v22); __ aesmc(v0, v0);
2834       __ aese(v0, v23); __ aesmc(v0, v0);
2835       __ aese(v0, v24); __ aesmc(v0, v0);
2836       __ aese(v0, v25); __ aesmc(v0, v0);
2837       __ aese(v0, v26); __ aesmc(v0, v0);
2838       __ aese(v0, v27); __ aesmc(v0, v0);
2839       __ aese(v0, v28); __ aesmc(v0, v0);
2840       __ aese(v0, v29); __ aesmc(v0, v0);
2841       __ aese(v0, v30);
2842       __ eor(v0, __ T16B, v0, v31);
2843 
2844       __ st1(v0, __ T16B, __ post(to, 16));
2845 
2846       __ subw(len_reg, len_reg, 16);
2847       __ cbnzw(len_reg, L_aes_loop);
2848 
2849       __ st1(v0, __ T16B, rvec);
2850 
2851       __ mov(r0, rscratch2);
2852 
2853       __ leave();
2854       __ ret(lr);
2855 
2856       return start;
2857   }
2858 
2859   // Arguments:
2860   //
2861   // Inputs:
2862   //   c_rarg0   - source byte array address
2863   //   c_rarg1   - destination byte array address
2864   //   c_rarg2   - K (key) in little endian int array
2865   //   c_rarg3   - r vector byte array address
2866   //   c_rarg4   - input length
2867   //
2868   // Output:
2869   //   r0        - input length
2870   //
2871   address generate_cipherBlockChaining_decryptAESCrypt() {
2872     assert(UseAES, "need AES cryptographic extension support");
2873     __ align(CodeEntryAlignment);
2874     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2875 
2876     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2877 
2878     const Register from        = c_rarg0;  // source array address
2879     const Register to          = c_rarg1;  // destination array address
2880     const Register key         = c_rarg2;  // key array address
2881     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector address
2882                                            // and left holding the last ciphertext block processed (the new chaining value)
2883     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2884     const Register keylen      = rscratch1;
2885 
2886     address start = __ pc();
2887 
2888       __ enter();
2889 
2890       __ movw(rscratch2, len_reg);
2891 
2892       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2893 
2894       __ ld1(v2, __ T16B, rvec);
2895 
2896       __ ld1(v31, __ T16B, __ post(key, 16));
2897       __ rev32(v31, __ T16B, v31);
2898 
2899       __ cmpw(keylen, 52);
2900       __ br(Assembler::CC, L_loadkeys_44);
2901       __ br(Assembler::EQ, L_loadkeys_52);
2902 
2903       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2904       __ rev32(v17, __ T16B, v17);
2905       __ rev32(v18, __ T16B, v18);
2906     __ BIND(L_loadkeys_52);
2907       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2908       __ rev32(v19, __ T16B, v19);
2909       __ rev32(v20, __ T16B, v20);
2910     __ BIND(L_loadkeys_44);
2911       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2912       __ rev32(v21, __ T16B, v21);
2913       __ rev32(v22, __ T16B, v22);
2914       __ rev32(v23, __ T16B, v23);
2915       __ rev32(v24, __ T16B, v24);
2916       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2917       __ rev32(v25, __ T16B, v25);
2918       __ rev32(v26, __ T16B, v26);
2919       __ rev32(v27, __ T16B, v27);
2920       __ rev32(v28, __ T16B, v28);
2921       __ ld1(v29, v30, __ T16B, key);
2922       __ rev32(v29, __ T16B, v29);
2923       __ rev32(v30, __ T16B, v30);
2924 
2925     __ BIND(L_aes_loop);
2926       __ ld1(v0, __ T16B, __ post(from, 16));
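      // Keep a copy of the raw ciphertext block in v1; it becomes the next
      // chaining value (moved to v2 after the decrypted block is stored).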
2927       __ orr(v1, __ T16B, v0, v0);
2928 
2929       __ br(Assembler::CC, L_rounds_44);
2930       __ br(Assembler::EQ, L_rounds_52);
2931 
2932       __ aesd(v0, v17); __ aesimc(v0, v0);
2933       __ aesd(v0, v18); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_52);
2935       __ aesd(v0, v19); __ aesimc(v0, v0);
2936       __ aesd(v0, v20); __ aesimc(v0, v0);
2937     __ BIND(L_rounds_44);
2938       __ aesd(v0, v21); __ aesimc(v0, v0);
2939       __ aesd(v0, v22); __ aesimc(v0, v0);
2940       __ aesd(v0, v23); __ aesimc(v0, v0);
2941       __ aesd(v0, v24); __ aesimc(v0, v0);
2942       __ aesd(v0, v25); __ aesimc(v0, v0);
2943       __ aesd(v0, v26); __ aesimc(v0, v0);
2944       __ aesd(v0, v27); __ aesimc(v0, v0);
2945       __ aesd(v0, v28); __ aesimc(v0, v0);
2946       __ aesd(v0, v29); __ aesimc(v0, v0);
2947       __ aesd(v0, v30);
2948       __ eor(v0, __ T16B, v0, v31);
2949       __ eor(v0, __ T16B, v0, v2);
2950 
2951       __ st1(v0, __ T16B, __ post(to, 16));
2952       __ orr(v2, __ T16B, v1, v1);
2953 
2954       __ subw(len_reg, len_reg, 16);
2955       __ cbnzw(len_reg, L_aes_loop);
2956 
2957       __ st1(v2, __ T16B, rvec);
2958 
2959       __ mov(r0, rscratch2);
2960 
2961       __ leave();
2962       __ ret(lr);
2963 
2964     return start;
2965   }
2966 
2967   // Arguments:
2968   //
2969   // Inputs:
2970   //   c_rarg0   - byte[]  source+offset
2971   //   c_rarg1   - int[]   SHA.state
2972   //   c_rarg2   - int     offset
2973   //   c_rarg3   - int     limit
2974   //
2975   address generate_sha1_implCompress(bool multi_block, const char *name) {
2976     __ align(CodeEntryAlignment);
2977     StubCodeMark mark(this, "StubRoutines", name);
2978     address start = __ pc();
2979 
2980     Register buf   = c_rarg0;
2981     Register state = c_rarg1;
2982     Register ofs   = c_rarg2;
2983     Register limit = c_rarg3;
2984 
2985     Label keys;
2986     Label sha1_loop;
2987 
2988     // load the keys into v0..v3
2989     __ adr(rscratch1, keys);
2990     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2991     // load 5 words state into v6, v7
2992     __ ldrq(v6, Address(state, 0));
2993     __ ldrs(v7, Address(state, 16));
2994 
2995 
2996     __ BIND(sha1_loop);
2997     // load 64 bytes of data into v16..v19
2998     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2999     __ rev32(v16, __ T16B, v16);
3000     __ rev32(v17, __ T16B, v17);
3001     __ rev32(v18, __ T16B, v18);
3002     __ rev32(v19, __ T16B, v19);
3003 
3004     // do the sha1
3005     __ addv(v4, __ T4S, v16, v0);
3006     __ orr(v20, __ T16B, v6, v6);
3007 
3008     FloatRegister d0 = v16;
3009     FloatRegister d1 = v17;
3010     FloatRegister d2 = v18;
3011     FloatRegister d3 = v19;
3012 
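    // 20 iterations of 4 SHA-1 rounds each (80 rounds in total). The addv in
    // iteration r prepares W+K for iteration r+1, which is why the round-constant
    // boundaries are at 4/9/14 rather than 5/10/15.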
3013     for (int round = 0; round < 20; round++) {
3014       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3015       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3016       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3017       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3018       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3019 
3020       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3021       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3022       __ sha1h(tmp2, __ T4S, v20);
3023       if (round < 5)
3024         __ sha1c(v20, __ T4S, tmp3, tmp4);
3025       else if (round < 10 || round >= 15)
3026         __ sha1p(v20, __ T4S, tmp3, tmp4);
3027       else
3028         __ sha1m(v20, __ T4S, tmp3, tmp4);
3029       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3030 
3031       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3032     }
3033 
3034     __ addv(v7, __ T2S, v7, v21);
3035     __ addv(v6, __ T4S, v6, v20);
3036 
3037     if (multi_block) {
3038       __ add(ofs, ofs, 64);
3039       __ cmp(ofs, limit);
3040       __ br(Assembler::LE, sha1_loop);
3041       __ mov(c_rarg0, ofs); // return ofs
3042     }
3043 
3044     __ strq(v6, Address(state, 0));
3045     __ strs(v7, Address(state, 16));
3046 
3047     __ ret(lr);
3048 
3049     __ bind(keys);
3050     __ emit_int32(0x5a827999);
3051     __ emit_int32(0x6ed9eba1);
3052     __ emit_int32(0x8f1bbcdc);
3053     __ emit_int32(0xca62c1d6);
3054 
3055     return start;
3056   }
3057 
3058 
3059   // Arguments:
3060   //
3061   // Inputs:
3062   //   c_rarg0   - byte[]  source+offset
3063   //   c_rarg1   - int[]   SHA.state
3064   //   c_rarg2   - int     offset
3065   //   c_rarg3   - int     limit
3066   //
3067   address generate_sha256_implCompress(bool multi_block, const char *name) {
3068     static const uint32_t round_consts[64] = {
3069       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3070       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3071       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3072       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3073       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3074       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3075       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3076       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3077       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3078       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3079       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3080       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3081       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3082       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3083       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3084       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3085     };
3086     __ align(CodeEntryAlignment);
3087     StubCodeMark mark(this, "StubRoutines", name);
3088     address start = __ pc();
3089 
3090     Register buf   = c_rarg0;
3091     Register state = c_rarg1;
3092     Register ofs   = c_rarg2;
3093     Register limit = c_rarg3;
3094 
3095     Label sha1_loop;
3096 
3097     __ stpd(v8, v9, __ pre(sp, -32));
3098     __ stpd(v10, v11, Address(sp, 16));
3099 
3100 // dga == v0
3101 // dgb == v1
3102 // dg0 == v2
3103 // dg1 == v3
3104 // dg2 == v4
3105 // t0 == v6
3106 // t1 == v7
3107 
3108     // load 16 keys to v16..v31
3109     __ lea(rscratch1, ExternalAddress((address)round_consts));
3110     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3111     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3112     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3113     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3114 
3115     // load 8 words (256 bits) state
3116     __ ldpq(v0, v1, state);
3117 
3118     __ BIND(sha1_loop);
3119     // load 64 bytes of data into v8..v11
3120     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3121     __ rev32(v8, __ T16B, v8);
3122     __ rev32(v9, __ T16B, v9);
3123     __ rev32(v10, __ T16B, v10);
3124     __ rev32(v11, __ T16B, v11);
3125 
3126     __ addv(v6, __ T4S, v8, v16);
3127     __ orr(v2, __ T16B, v0, v0);
3128     __ orr(v3, __ T16B, v1, v1);
3129 
3130     FloatRegister d0 = v8;
3131     FloatRegister d1 = v9;
3132     FloatRegister d2 = v10;
3133     FloatRegister d3 = v11;
3134 
3135 
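    // 16 iterations, each consuming one 4-word block of the message schedule via
    // sha256h/sha256h2 (64 rounds in total); the first 12 iterations also extend
    // the schedule with sha256su0/sha256su1.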
3136     for (int round = 0; round < 16; round++) {
3137       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3138       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3139       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3140       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3141 
3142       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3143        __ orr(v4, __ T16B, v2, v2);
3144       if (round < 15)
3145         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3146       __ sha256h(v2, __ T4S, v3, tmp2);
3147       __ sha256h2(v3, __ T4S, v4, tmp2);
3148       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3149 
3150       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3151     }
3152 
3153     __ addv(v0, __ T4S, v0, v2);
3154     __ addv(v1, __ T4S, v1, v3);
3155 
3156     if (multi_block) {
3157       __ add(ofs, ofs, 64);
3158       __ cmp(ofs, limit);
3159       __ br(Assembler::LE, sha1_loop);
3160       __ mov(c_rarg0, ofs); // return ofs
3161     }
3162 
3163     __ ldpd(v10, v11, Address(sp, 16));
3164     __ ldpd(v8, v9, __ post(sp, 32));
3165 
3166     __ stpq(v0, v1, state);
3167 
3168     __ ret(lr);
3169 
3170     return start;
3171   }
3172 
3173   // Arguments:
3174   //
3175   // Inputs:
3176   //   c_rarg0   - byte[]  source+offset
3177   //   c_rarg1   - int[]   SHA.state
3178   //   c_rarg2   - int     offset
3179   //   c_rarg3   - int     limit
3180   //
3181   address generate_sha512_implCompress(bool multi_block, const char *name) {
3182     static const uint64_t round_consts[80] = {
3183       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3184       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3185       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3186       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3187       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3188       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3189       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3190       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3191       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3192       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3193       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3194       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3195       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3196       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3197       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3198       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3199       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3200       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3201       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3202       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3203       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3204       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3205       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3206       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3207       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3208       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3209       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3210     };
3211 
3212     // Double rounds for sha512.
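    // Macro arguments: dr is the double-round index; i0..i4 pick the rotating
    // working-state registers (v0..v4); rc0/rc1 are the current and prefetched
    // round-constant registers; in0..in4 are the message-schedule registers
    // (v12..v19).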
3213     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3214       if (dr < 36)                                                                   \
3215         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3216       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3217       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3218       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3219       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3220       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3221       if (dr < 32) {                                                                 \
3222         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3223         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3224       }                                                                              \
3225       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3226       if (dr < 32)                                                                   \
3227         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3228       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3229       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3230 
3231     __ align(CodeEntryAlignment);
3232     StubCodeMark mark(this, "StubRoutines", name);
3233     address start = __ pc();
3234 
3235     Register buf   = c_rarg0;
3236     Register state = c_rarg1;
3237     Register ofs   = c_rarg2;
3238     Register limit = c_rarg3;
3239 
3240     __ stpd(v8, v9, __ pre(sp, -64));
3241     __ stpd(v10, v11, Address(sp, 16));
3242     __ stpd(v12, v13, Address(sp, 32));
3243     __ stpd(v14, v15, Address(sp, 48));
3244 
3245     Label sha512_loop;
3246 
3247     // load state
3248     __ ld1(v8, v9, v10, v11, __ T2D, state);
3249 
3250     // load first 4 round constants
3251     __ lea(rscratch1, ExternalAddress((address)round_consts));
3252     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3253 
3254     __ BIND(sha512_loop);
3255     // load 128B of data into v12..v19
3256     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3257     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3258     __ rev64(v12, __ T16B, v12);
3259     __ rev64(v13, __ T16B, v13);
3260     __ rev64(v14, __ T16B, v14);
3261     __ rev64(v15, __ T16B, v15);
3262     __ rev64(v16, __ T16B, v16);
3263     __ rev64(v17, __ T16B, v17);
3264     __ rev64(v18, __ T16B, v18);
3265     __ rev64(v19, __ T16B, v19);
3266 
3267     __ mov(rscratch2, rscratch1);
3268 
3269     __ mov(v0, __ T16B, v8);
3270     __ mov(v1, __ T16B, v9);
3271     __ mov(v2, __ T16B, v10);
3272     __ mov(v3, __ T16B, v11);
3273 
3274     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3275     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3276     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3277     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3278     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3279     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3280     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3281     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3282     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3283     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3284     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3285     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3286     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3287     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3288     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3289     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3290     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3291     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3292     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3293     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3294     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3295     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3296     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3297     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3298     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3299     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3300     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3301     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3302     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3303     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3304     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3305     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3306     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3307     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3308     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3309     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3310     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3311     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3312     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3313     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3314 
3315     __ addv(v8, __ T2D, v8, v0);
3316     __ addv(v9, __ T2D, v9, v1);
3317     __ addv(v10, __ T2D, v10, v2);
3318     __ addv(v11, __ T2D, v11, v3);
3319 
3320     if (multi_block) {
3321       __ add(ofs, ofs, 128);
3322       __ cmp(ofs, limit);
3323       __ br(Assembler::LE, sha512_loop);
3324       __ mov(c_rarg0, ofs); // return ofs
3325     }
3326 
3327     __ st1(v8, v9, v10, v11, __ T2D, state);
3328 
3329     __ ldpd(v14, v15, Address(sp, 48));
3330     __ ldpd(v12, v13, Address(sp, 32));
3331     __ ldpd(v10, v11, Address(sp, 16));
3332     __ ldpd(v8, v9, __ post(sp, 64));
3333 
3334     __ ret(lr);
3335 
3336     return start;
3337   }
3338 
3339   // Arguments:
3340   //
3341   // Inputs:
3342   //   c_rarg0   - byte[]  source+offset
3343   //   c_rarg1   - byte[]   SHA.state
3344   //   c_rarg2   - int     digest_length
3345   //   c_rarg3   - int     offset
3346   //   c_rarg4   - int     limit
3347   //
3348   address generate_sha3_implCompress(bool multi_block, const char *name) {
3349     static const uint64_t round_consts[24] = {
3350       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3351       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3352       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3353       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3354       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3355       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3356       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3357       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3358     };
3359 
3360     __ align(CodeEntryAlignment);
3361     StubCodeMark mark(this, "StubRoutines", name);
3362     address start = __ pc();
3363 
3364     Register buf           = c_rarg0;
3365     Register state         = c_rarg1;
3366     Register digest_length = c_rarg2;
3367     Register ofs           = c_rarg3;
3368     Register limit         = c_rarg4;
3369 
3370     Label sha3_loop, rounds24_loop;
3371     Label sha3_512, sha3_384_or_224, sha3_256;
3372 
3373     __ stpd(v8, v9, __ pre(sp, -64));
3374     __ stpd(v10, v11, Address(sp, 16));
3375     __ stpd(v12, v13, Address(sp, 32));
3376     __ stpd(v14, v15, Address(sp, 48));
3377 
3378     // load state
3379     __ add(rscratch1, state, 32);
3380     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3381     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3382     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3383     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3384     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3385     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3386     __ ld1(v24, __ T1D, rscratch1);
3387 
3388     __ BIND(sha3_loop);
3389 
3390     // 24 keccak rounds
3391     __ movw(rscratch2, 24);
3392 
3393     // load round_constants base
3394     __ lea(rscratch1, ExternalAddress((address) round_consts));
3395 
3396     // load input
3397     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3398     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3399     __ eor(v0, __ T8B, v0, v25);
3400     __ eor(v1, __ T8B, v1, v26);
3401     __ eor(v2, __ T8B, v2, v27);
3402     __ eor(v3, __ T8B, v3, v28);
3403     __ eor(v4, __ T8B, v4, v29);
3404     __ eor(v5, __ T8B, v5, v30);
3405     __ eor(v6, __ T8B, v6, v31);
3406 
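    // All SHA3 variants absorb at least these 7 lanes (56 bytes). The rate is
    // 200 - 2 * digest_length bytes: 72/104/136/144 bytes (9/13/17/18 lanes)
    // for SHA3-512/384/256/224 respectively.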
3407     // digest_length == 64, SHA3-512
3408     __ tbnz(digest_length, 6, sha3_512);
3409 
3410     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3411     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3412     __ eor(v7, __ T8B, v7, v25);
3413     __ eor(v8, __ T8B, v8, v26);
3414     __ eor(v9, __ T8B, v9, v27);
3415     __ eor(v10, __ T8B, v10, v28);
3416     __ eor(v11, __ T8B, v11, v29);
3417     __ eor(v12, __ T8B, v12, v30);
3418 
3419     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3420     __ tbnz(digest_length, 4, sha3_384_or_224);
3421 
3422     // SHA3-256
3423     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3424     __ eor(v13, __ T8B, v13, v25);
3425     __ eor(v14, __ T8B, v14, v26);
3426     __ eor(v15, __ T8B, v15, v27);
3427     __ eor(v16, __ T8B, v16, v28);
3428     __ b(rounds24_loop);
3429 
3430     __ BIND(sha3_384_or_224);
3431     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3432 
3433     // SHA3-224
3434     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3435     __ ld1(v29, __ T8B, __ post(buf, 8));
3436     __ eor(v13, __ T8B, v13, v25);
3437     __ eor(v14, __ T8B, v14, v26);
3438     __ eor(v15, __ T8B, v15, v27);
3439     __ eor(v16, __ T8B, v16, v28);
3440     __ eor(v17, __ T8B, v17, v29);
3441     __ b(rounds24_loop);
3442 
3443     __ BIND(sha3_512);
3444     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3445     __ eor(v7, __ T8B, v7, v25);
3446     __ eor(v8, __ T8B, v8, v26);
3447 
3448     __ BIND(rounds24_loop);
3449     __ subw(rscratch2, rscratch2, 1);
3450 
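    // theta: eor3 computes the five column parities C[x]; rax1 xors each parity
    // with a one-bit rotation of another to form the D values.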
3451     __ eor3(v29, __ T16B, v4, v9, v14);
3452     __ eor3(v26, __ T16B, v1, v6, v11);
3453     __ eor3(v28, __ T16B, v3, v8, v13);
3454     __ eor3(v25, __ T16B, v0, v5, v10);
3455     __ eor3(v27, __ T16B, v2, v7, v12);
3456     __ eor3(v29, __ T16B, v29, v19, v24);
3457     __ eor3(v26, __ T16B, v26, v16, v21);
3458     __ eor3(v28, __ T16B, v28, v18, v23);
3459     __ eor3(v25, __ T16B, v25, v15, v20);
3460     __ eor3(v27, __ T16B, v27, v17, v22);
3461 
3462     __ rax1(v30, __ T2D, v29, v26);
3463     __ rax1(v26, __ T2D, v26, v28);
3464     __ rax1(v28, __ T2D, v28, v25);
3465     __ rax1(v25, __ T2D, v25, v27);
3466     __ rax1(v27, __ T2D, v27, v29);
3467 
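    // rho + pi: each xar xors the appropriate D value into a lane and rotates it
    // by that lane's offset; lane (0, 0) has rotation 0, hence the plain eor for v0 below.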
3468     __ eor(v0, __ T16B, v0, v30);
3469     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3470     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3471     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3472     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3473     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3474     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3475     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3476     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3477     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3478     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3479     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3480     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3481     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3482     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3483     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3484     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3485     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3486     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3487     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3488     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3489     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3490     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3491     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3492     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3493 
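    // chi: bcax applies the non-linear row mix, xoring each lane with the
    // and-not of the next two lanes in its row.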
3494     __ bcax(v20, __ T16B, v31, v22, v8);
3495     __ bcax(v21, __ T16B, v8,  v23, v22);
3496     __ bcax(v22, __ T16B, v22, v24, v23);
3497     __ bcax(v23, __ T16B, v23, v31, v24);
3498     __ bcax(v24, __ T16B, v24, v8,  v31);
3499 
3500     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3501 
3502     __ bcax(v17, __ T16B, v25, v19, v3);
3503     __ bcax(v18, __ T16B, v3,  v15, v19);
3504     __ bcax(v19, __ T16B, v19, v16, v15);
3505     __ bcax(v15, __ T16B, v15, v25, v16);
3506     __ bcax(v16, __ T16B, v16, v3,  v25);
3507 
3508     __ bcax(v10, __ T16B, v29, v12, v26);
3509     __ bcax(v11, __ T16B, v26, v13, v12);
3510     __ bcax(v12, __ T16B, v12, v14, v13);
3511     __ bcax(v13, __ T16B, v13, v29, v14);
3512     __ bcax(v14, __ T16B, v14, v26, v29);
3513 
3514     __ bcax(v7, __ T16B, v30, v9,  v4);
3515     __ bcax(v8, __ T16B, v4,  v5,  v9);
3516     __ bcax(v9, __ T16B, v9,  v6,  v5);
3517     __ bcax(v5, __ T16B, v5,  v30, v6);
3518     __ bcax(v6, __ T16B, v6,  v4,  v30);
3519 
3520     __ bcax(v3, __ T16B, v27, v0,  v28);
3521     __ bcax(v4, __ T16B, v28, v1,  v0);
3522     __ bcax(v0, __ T16B, v0,  v2,  v1);
3523     __ bcax(v1, __ T16B, v1,  v27, v2);
3524     __ bcax(v2, __ T16B, v2,  v28, v27);
3525 
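    // iota: xor the round constant (broadcast into v31 by the ld1r above) into
    // lane (0, 0).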
3526     __ eor(v0, __ T16B, v0, v31);
3527 
3528     __ cbnzw(rscratch2, rounds24_loop);
3529 
3530     if (multi_block) {
3531       // block_size =  200 - 2 * digest_length, ofs += block_size
3532       __ add(ofs, ofs, 200);
3533       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3534 
3535       __ cmp(ofs, limit);
3536       __ br(Assembler::LE, sha3_loop);
3537       __ mov(c_rarg0, ofs); // return ofs
3538     }
3539 
3540     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3541     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3542     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3543     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3544     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3545     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3546     __ st1(v24, __ T1D, state);
3547 
3548     __ ldpd(v14, v15, Address(sp, 48));
3549     __ ldpd(v12, v13, Address(sp, 32));
3550     __ ldpd(v10, v11, Address(sp, 16));
3551     __ ldpd(v8, v9, __ post(sp, 64));
3552 
3553     __ ret(lr);
3554 
3555     return start;
3556   }
3557 
3558   // Safefetch stubs.
3559   void generate_safefetch(const char* name, int size, address* entry,
3560                           address* fault_pc, address* continuation_pc) {
3561     // safefetch signatures:
3562     //   int      SafeFetch32(int*      adr, int      errValue);
3563     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3564     //
3565     // arguments:
3566     //   c_rarg0 = adr
3567     //   c_rarg1 = errValue
3568     //
3569     // result:
3570     //   r0       = *adr or errValue
3571 
3572     StubCodeMark mark(this, "StubRoutines", name);
3573 
3574     // Entry point, pc or function descriptor.
3575     *entry = __ pc();
3576 
3577     // Load *adr into c_rarg1, may fault.
3578     *fault_pc = __ pc();
3579     switch (size) {
3580       case 4:
3581         // int32_t
3582         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3583         break;
3584       case 8:
3585         // int64_t
3586         __ ldr(c_rarg1, Address(c_rarg0, 0));
3587         break;
3588       default:
3589         ShouldNotReachHere();
3590     }
3591 
3592     // return errValue or *adr
3593     *continuation_pc = __ pc();
3594     __ mov(r0, c_rarg1);
3595     __ ret(lr);
3596   }
3597 
3598   /**
3599    *  Arguments:
3600    *
3601    * Inputs:
3602    *   c_rarg0   - int crc
3603    *   c_rarg1   - byte* buf
3604    *   c_rarg2   - int length
3605    *
3606    * Output:
3607    *       r0    - int crc result
3608    */
3609   address generate_updateBytesCRC32() {
3610     assert(UseCRC32Intrinsics, "what are we doing here?");
3611 
3612     __ align(CodeEntryAlignment);
3613     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3614 
3615     address start = __ pc();
3616 
3617     const Register crc   = c_rarg0;  // crc
3618     const Register buf   = c_rarg1;  // source java byte array address
3619     const Register len   = c_rarg2;  // length
3620     const Register table0 = c_rarg3; // crc_table address
3621     const Register table1 = c_rarg4;
3622     const Register table2 = c_rarg5;
3623     const Register table3 = c_rarg6;
3624     const Register tmp3 = c_rarg7;
3625 
3626     BLOCK_COMMENT("Entry:");
3627     __ enter(); // required for proper stackwalking of RuntimeStub frame
3628 
3629     __ kernel_crc32(crc, buf, len,
3630               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3631 
3632     __ leave(); // required for proper stackwalking of RuntimeStub frame
3633     __ ret(lr);
3634 
3635     return start;
3636   }
3637 
3638   /**
3639    *  Arguments:
3640    *
3641    * Inputs:
3642    *   c_rarg0   - int crc
3643    *   c_rarg1   - byte* buf
3644    *   c_rarg2   - int length
3645    *   c_rarg3   - int* table
3646    *
3647    * Output:
3648    *       r0   - int crc result
3649    */
3650   address generate_updateBytesCRC32C() {
3651     assert(UseCRC32CIntrinsics, "what are we doing here?");
3652 
3653     __ align(CodeEntryAlignment);
3654     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3655 
3656     address start = __ pc();
3657 
3658     const Register crc   = c_rarg0;  // crc
3659     const Register buf   = c_rarg1;  // source java byte array address
3660     const Register len   = c_rarg2;  // length
3661     const Register table0 = c_rarg3; // crc_table address
3662     const Register table1 = c_rarg4;
3663     const Register table2 = c_rarg5;
3664     const Register table3 = c_rarg6;
3665     const Register tmp3 = c_rarg7;
3666 
3667     BLOCK_COMMENT("Entry:");
3668     __ enter(); // required for proper stackwalking of RuntimeStub frame
3669 
3670     __ kernel_crc32c(crc, buf, len,
3671               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3672 
3673     __ leave(); // required for proper stackwalking of RuntimeStub frame
3674     __ ret(lr);
3675 
3676     return start;
3677   }
3678 
3679   /***
3680    *  Arguments:
3681    *
3682    *  Inputs:
3683    *   c_rarg0   - int   adler
3684    *   c_rarg1   - byte* buff
3685    *   c_rarg2   - int   len
3686    *
3687    * Output:
3688    *   c_rarg0   - int adler result
3689    */
3690   address generate_updateBytesAdler32() {
3691     __ align(CodeEntryAlignment);
3692     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3693     address start = __ pc();
3694 
3695     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3696 
3697     // Aliases
3698     Register adler  = c_rarg0;
3699     Register s1     = c_rarg0;
3700     Register s2     = c_rarg3;
3701     Register buff   = c_rarg1;
3702     Register len    = c_rarg2;
3703     Register nmax  = r4;
3704     Register base  = r5;
3705     Register count = r6;
3706     Register temp0 = rscratch1;
3707     Register temp1 = rscratch2;
3708     FloatRegister vbytes = v0;
3709     FloatRegister vs1acc = v1;
3710     FloatRegister vs2acc = v2;
3711     FloatRegister vtable = v3;
3712 
3713     // Max number of bytes we can process before having to take the mod
3714     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3715     uint64_t BASE = 0xfff1;
3716     uint64_t NMAX = 0x15B0;
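    // The reductions below use 2^16 == 15 (mod BASE): x mod BASE is obtained by
    // folding x into (x >> 16) * 15 + (x & 0xffff) (applied twice when the
    // accumulator may be large), followed by a conditional subtraction of BASE.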
3717 
3718     __ mov(base, BASE);
3719     __ mov(nmax, NMAX);
3720 
3721     // Load accumulation coefficients for the upper 16 bits
3722     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3723     __ ld1(vtable, __ T16B, Address(temp0));
3724 
3725     // s1 is initialized to the lower 16 bits of adler
3726     // s2 is initialized to the upper 16 bits of adler
3727     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3728     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3729 
3730     // The pipelined loop needs at least 16 elements for one iteration;
3731     // it checks this itself, but it is more efficient to branch straight to the cleanup loop for short inputs
3732     __ cmp(len, (u1)16);
3733     __ br(Assembler::HS, L_nmax);
3734     __ cbz(len, L_combine);
3735 
3736     __ bind(L_simple_by1_loop);
3737     __ ldrb(temp0, Address(__ post(buff, 1)));
3738     __ add(s1, s1, temp0);
3739     __ add(s2, s2, s1);
3740     __ subs(len, len, 1);
3741     __ br(Assembler::HI, L_simple_by1_loop);
3742 
3743     // s1 = s1 % BASE
3744     __ subs(temp0, s1, base);
3745     __ csel(s1, temp0, s1, Assembler::HS);
3746 
3747     // s2 = s2 % BASE
3748     __ lsr(temp0, s2, 16);
3749     __ lsl(temp1, temp0, 4);
3750     __ sub(temp1, temp1, temp0);
3751     __ add(s2, temp1, s2, ext::uxth);
3752 
3753     __ subs(temp0, s2, base);
3754     __ csel(s2, temp0, s2, Assembler::HS);
3755 
3756     __ b(L_combine);
3757 
3758     __ bind(L_nmax);
3759     __ subs(len, len, nmax);
3760     __ sub(count, nmax, 16);
3761     __ br(Assembler::LO, L_by16);
3762 
3763     __ bind(L_nmax_loop);
3764 
3765     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3766                                       vbytes, vs1acc, vs2acc, vtable);
3767 
3768     __ subs(count, count, 16);
3769     __ br(Assembler::HS, L_nmax_loop);
3770 
3771     // s1 = s1 % BASE
3772     __ lsr(temp0, s1, 16);
3773     __ lsl(temp1, temp0, 4);
3774     __ sub(temp1, temp1, temp0);
3775     __ add(temp1, temp1, s1, ext::uxth);
3776 
3777     __ lsr(temp0, temp1, 16);
3778     __ lsl(s1, temp0, 4);
3779     __ sub(s1, s1, temp0);
3780     __ add(s1, s1, temp1, ext:: uxth);
3781 
3782     __ subs(temp0, s1, base);
3783     __ csel(s1, temp0, s1, Assembler::HS);
3784 
3785     // s2 = s2 % BASE
3786     __ lsr(temp0, s2, 16);
3787     __ lsl(temp1, temp0, 4);
3788     __ sub(temp1, temp1, temp0);
3789     __ add(temp1, temp1, s2, ext::uxth);
3790 
3791     __ lsr(temp0, temp1, 16);
3792     __ lsl(s2, temp0, 4);
3793     __ sub(s2, s2, temp0);
3794     __ add(s2, s2, temp1, ext:: uxth);
3795 
3796     __ subs(temp0, s2, base);
3797     __ csel(s2, temp0, s2, Assembler::HS);
3798 
3799     __ subs(len, len, nmax);
3800     __ sub(count, nmax, 16);
3801     __ br(Assembler::HS, L_nmax_loop);
3802 
3803     __ bind(L_by16);
3804     __ adds(len, len, count);
3805     __ br(Assembler::LO, L_by1);
3806 
3807     __ bind(L_by16_loop);
3808 
3809     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3810                                       vbytes, vs1acc, vs2acc, vtable);
3811 
3812     __ subs(len, len, 16);
3813     __ br(Assembler::HS, L_by16_loop);
3814 
3815     __ bind(L_by1);
3816     __ adds(len, len, 15);
3817     __ br(Assembler::LO, L_do_mod);
3818 
3819     __ bind(L_by1_loop);
3820     __ ldrb(temp0, Address(__ post(buff, 1)));
3821     __ add(s1, temp0, s1);
3822     __ add(s2, s2, s1);
3823     __ subs(len, len, 1);
3824     __ br(Assembler::HS, L_by1_loop);
3825 
3826     __ bind(L_do_mod);
3827     // s1 = s1 % BASE
3828     __ lsr(temp0, s1, 16);
3829     __ lsl(temp1, temp0, 4);
3830     __ sub(temp1, temp1, temp0);
3831     __ add(temp1, temp1, s1, ext::uxth);
3832 
3833     __ lsr(temp0, temp1, 16);
3834     __ lsl(s1, temp0, 4);
3835     __ sub(s1, s1, temp0);
3836     __ add(s1, s1, temp1, ext:: uxth);
3837 
3838     __ subs(temp0, s1, base);
3839     __ csel(s1, temp0, s1, Assembler::HS);
3840 
3841     // s2 = s2 % BASE
3842     __ lsr(temp0, s2, 16);
3843     __ lsl(temp1, temp0, 4);
3844     __ sub(temp1, temp1, temp0);
3845     __ add(temp1, temp1, s2, ext::uxth);
3846 
3847     __ lsr(temp0, temp1, 16);
3848     __ lsl(s2, temp0, 4);
3849     __ sub(s2, s2, temp0);
3850     __ add(s2, s2, temp1, ext:: uxth);
3851 
3852     __ subs(temp0, s2, base);
3853     __ csel(s2, temp0, s2, Assembler::HS);
3854 
3855     // Combine lower bits and higher bits
3856     __ bind(L_combine);
3857     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3858 
3859     __ ret(lr);
3860 
3861     return start;
3862   }
3863 
3864   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3865           Register temp0, Register temp1, FloatRegister vbytes,
3866           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3867     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3868     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3869     // In non-vectorized code, we update s1 and s2 as:
3870     //   s1 <- s1 + b1
3871     //   s2 <- s2 + s1
3872     //   s1 <- s1 + b2
3873     //   s2 <- s2 + s1
3874     //   ...
3875     //   s1 <- s1 + b16
3876     //   s2 <- s2 + s1
3877     // Putting above assignments together, we have:
3878     //   s1_new = s1 + b1 + b2 + ... + b16
3879     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3880     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3881     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3882     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3883 
3884     // s2 = s2 + s1 * 16
3885     __ add(s2, s2, s1, Assembler::LSL, 4);
3886 
3887     // vs1acc = b1 + b2 + b3 + ... + b16
3888     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3889     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3890     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3891     __ uaddlv(vs1acc, __ T16B, vbytes);
3892     __ uaddlv(vs2acc, __ T8H, vs2acc);
3893 
3894     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3895     __ fmovd(temp0, vs1acc);
3896     __ fmovd(temp1, vs2acc);
3897     __ add(s1, s1, temp0);
3898     __ add(s2, s2, temp1);
3899   }
3900 
3901   /**
3902    *  Arguments:
3903    *
3904    *  Input:
3905    *    c_rarg0   - x address
3906    *    c_rarg1   - x length
3907    *    c_rarg2   - y address
3908    *   c_rarg3   - y length
3909    *    c_rarg4   - z address
3910    *    c_rarg5   - z length
3911    */
3912   address generate_multiplyToLen() {
3913     __ align(CodeEntryAlignment);
3914     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3915 
3916     address start = __ pc();
3917     const Register x     = r0;
3918     const Register xlen  = r1;
3919     const Register y     = r2;
3920     const Register ylen  = r3;
3921     const Register z     = r4;
3922     const Register zlen  = r5;
3923 
3924     const Register tmp1  = r10;
3925     const Register tmp2  = r11;
3926     const Register tmp3  = r12;
3927     const Register tmp4  = r13;
3928     const Register tmp5  = r14;
3929     const Register tmp6  = r15;
3930     const Register tmp7  = r16;
3931 
3932     BLOCK_COMMENT("Entry:");
3933     __ enter(); // required for proper stackwalking of RuntimeStub frame
3934     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3935     __ leave(); // required for proper stackwalking of RuntimeStub frame
3936     __ ret(lr);
3937 
3938     return start;
3939   }
3940 
3941   address generate_squareToLen() {
3942     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3943     // faster than multiply_to_len on some CPUs and slower on others, but
3944     // multiply_to_len gives slightly better results overall.
3945     __ align(CodeEntryAlignment);
3946     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3947     address start = __ pc();
3948 
3949     const Register x     = r0;
3950     const Register xlen  = r1;
3951     const Register z     = r2;
3952     const Register zlen  = r3;
3953     const Register y     = r4; // == x
3954     const Register ylen  = r5; // == xlen
3955 
3956     const Register tmp1  = r10;
3957     const Register tmp2  = r11;
3958     const Register tmp3  = r12;
3959     const Register tmp4  = r13;
3960     const Register tmp5  = r14;
3961     const Register tmp6  = r15;
3962     const Register tmp7  = r16;
3963 
3964     RegSet spilled_regs = RegSet::of(y, ylen);
3965     BLOCK_COMMENT("Entry:");
3966     __ enter();
3967     __ push(spilled_regs, sp);
3968     __ mov(y, x);
3969     __ mov(ylen, xlen);
3970     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3971     __ pop(spilled_regs, sp);
3972     __ leave();
3973     __ ret(lr);
3974     return start;
3975   }
3976 
3977   address generate_mulAdd() {
3978     __ align(CodeEntryAlignment);
3979     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3980 
3981     address start = __ pc();
3982 
3983     const Register out     = r0;
3984     const Register in      = r1;
3985     const Register offset  = r2;
3986     const Register len     = r3;
3987     const Register k       = r4;
3988 
3989     BLOCK_COMMENT("Entry:");
3990     __ enter();
3991     __ mul_add(out, in, offset, len, k);
3992     __ leave();
3993     __ ret(lr);
3994 
3995     return start;
3996   }
3997 
3998   // Arguments:
3999   //
4000   // Input:
4001   //   c_rarg0   - newArr address
4002   //   c_rarg1   - oldArr address
4003   //   c_rarg2   - newIdx
4004   //   c_rarg3   - shiftCount
4005   //   c_rarg4   - numIter
4006   //
4007   address generate_bigIntegerRightShift() {
4008     __ align(CodeEntryAlignment);
4009     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4010     address start = __ pc();
4011 
4012     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4013 
4014     Register newArr        = c_rarg0;
4015     Register oldArr        = c_rarg1;
4016     Register newIdx        = c_rarg2;
4017     Register shiftCount    = c_rarg3;
4018     Register numIter       = c_rarg4;
4019     Register idx           = numIter;
4020 
4021     Register newArrCur     = rscratch1;
4022     Register shiftRevCount = rscratch2;
4023     Register oldArrCur     = r13;
4024     Register oldArrNext    = r14;
4025 
4026     FloatRegister oldElem0        = v0;
4027     FloatRegister oldElem1        = v1;
4028     FloatRegister newElem         = v2;
4029     FloatRegister shiftVCount     = v3;
4030     FloatRegister shiftVRevCount  = v4;
4031 
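    // Each output word combines two adjacent input words (logical shifts on
    // 32-bit words):
    //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount))
    // The loop walks idx downwards from numIter.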
4032     __ cbz(idx, Exit);
4033 
4034     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4035 
4036     // left shift count
4037     __ movw(shiftRevCount, 32);
4038     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4039 
4040     // numIter is too small for a 4-word SIMD loop; fall back to the scalar paths
4041     __ cmp(numIter, (u1)4);
4042     __ br(Assembler::LT, ShiftThree);
4043 
4044     __ dup(shiftVCount,    __ T4S, shiftCount);
4045     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4046     __ negr(shiftVCount,   __ T4S, shiftVCount);
4047 
4048     __ BIND(ShiftSIMDLoop);
4049 
4050     // Calculate the load addresses
4051     __ sub(idx, idx, 4);
4052     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4053     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4054     __ add(oldArrCur,  oldArrNext, 4);
4055 
4056     // Load 4 words and process
4057     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4058     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4059     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4060     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4061     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4062     __ st1(newElem,   __ T4S,  Address(newArrCur));
4063 
4064     __ cmp(idx, (u1)4);
4065     __ br(Assembler::LT, ShiftTwoLoop);
4066     __ b(ShiftSIMDLoop);
4067 
4068     __ BIND(ShiftTwoLoop);
4069     __ cbz(idx, Exit);
4070     __ cmp(idx, (u1)1);
4071     __ br(Assembler::EQ, ShiftOne);
4072 
4073     // Calculate the load addresses
4074     __ sub(idx, idx, 2);
4075     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4076     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4077     __ add(oldArrCur,  oldArrNext, 4);
4078 
4079     // Load 2 words and process
4080     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4081     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4082     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4083     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4084     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4085     __ st1(newElem,   __ T2S, Address(newArrCur));
4086     __ b(ShiftTwoLoop);
4087 
4088     __ BIND(ShiftThree);
4089     __ tbz(idx, 1, ShiftOne);
4090     __ tbz(idx, 0, ShiftTwo);
4091     __ ldrw(r10,  Address(oldArr, 12));
4092     __ ldrw(r11,  Address(oldArr, 8));
4093     __ lsrvw(r10, r10, shiftCount);
4094     __ lslvw(r11, r11, shiftRevCount);
4095     __ orrw(r12,  r10, r11);
4096     __ strw(r12,  Address(newArr, 8));
4097 
4098     __ BIND(ShiftTwo);
4099     __ ldrw(r10,  Address(oldArr, 8));
4100     __ ldrw(r11,  Address(oldArr, 4));
4101     __ lsrvw(r10, r10, shiftCount);
4102     __ lslvw(r11, r11, shiftRevCount);
4103     __ orrw(r12,  r10, r11);
4104     __ strw(r12,  Address(newArr, 4));
4105 
4106     __ BIND(ShiftOne);
4107     __ ldrw(r10,  Address(oldArr, 4));
4108     __ ldrw(r11,  Address(oldArr));
4109     __ lsrvw(r10, r10, shiftCount);
4110     __ lslvw(r11, r11, shiftRevCount);
4111     __ orrw(r12,  r10, r11);
4112     __ strw(r12,  Address(newArr));
4113 
4114     __ BIND(Exit);
4115     __ ret(lr);
4116 
4117     return start;
4118   }
4119 
4120   // Arguments:
4121   //
4122   // Input:
4123   //   c_rarg0   - newArr address
4124   //   c_rarg1   - oldArr address
4125   //   c_rarg2   - newIdx
4126   //   c_rarg3   - shiftCount
4127   //   c_rarg4   - numIter
4128   //
4129   address generate_bigIntegerLeftShift() {
4130     __ align(CodeEntryAlignment);
4131     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4132     address start = __ pc();
4133 
4134     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4135 
4136     Register newArr        = c_rarg0;
4137     Register oldArr        = c_rarg1;
4138     Register newIdx        = c_rarg2;
4139     Register shiftCount    = c_rarg3;
4140     Register numIter       = c_rarg4;
4141 
4142     Register shiftRevCount = rscratch1;
4143     Register oldArrNext    = rscratch2;
4144 
4145     FloatRegister oldElem0        = v0;
4146     FloatRegister oldElem1        = v1;
4147     FloatRegister newElem         = v2;
4148     FloatRegister shiftVCount     = v3;
4149     FloatRegister shiftVRevCount  = v4;
4150 
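    // Each output word combines two adjacent input words (logical shifts on
    // 32-bit words):
    //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount))
    // processed from the low index upwards.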
4151     __ cbz(numIter, Exit);
4152 
4153     __ add(oldArrNext, oldArr, 4);
4154     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4155 
4156     // right shift count
4157     __ movw(shiftRevCount, 32);
4158     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4159 
4160     // numIter is too small for a 4-word SIMD loop; fall back to the scalar paths
4161     __ cmp(numIter, (u1)4);
4162     __ br(Assembler::LT, ShiftThree);
4163 
4164     __ dup(shiftVCount,     __ T4S, shiftCount);
4165     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4166     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4167 
4168     __ BIND(ShiftSIMDLoop);
4169 
4170     // load 4 words and process
4171     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4172     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4173     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4174     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4175     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4176     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4177     __ sub(numIter,   numIter, 4);
4178 
4179     __ cmp(numIter, (u1)4);
4180     __ br(Assembler::LT, ShiftTwoLoop);
4181     __ b(ShiftSIMDLoop);
4182 
4183     __ BIND(ShiftTwoLoop);
4184     __ cbz(numIter, Exit);
4185     __ cmp(numIter, (u1)1);
4186     __ br(Assembler::EQ, ShiftOne);
4187 
4188     // load 2 words and process
4189     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4190     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4191     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4192     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4193     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4194     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4195     __ sub(numIter,   numIter, 2);
4196     __ b(ShiftTwoLoop);
4197 
4198     __ BIND(ShiftThree);
4199     __ ldrw(r10,  __ post(oldArr, 4));
4200     __ ldrw(r11,  __ post(oldArrNext, 4));
4201     __ lslvw(r10, r10, shiftCount);
4202     __ lsrvw(r11, r11, shiftRevCount);
4203     __ orrw(r12,  r10, r11);
4204     __ strw(r12,  __ post(newArr, 4));
4205     __ tbz(numIter, 1, Exit);
4206     __ tbz(numIter, 0, ShiftOne);
4207 
4208     __ BIND(ShiftTwo);
4209     __ ldrw(r10,  __ post(oldArr, 4));
4210     __ ldrw(r11,  __ post(oldArrNext, 4));
4211     __ lslvw(r10, r10, shiftCount);
4212     __ lsrvw(r11, r11, shiftRevCount);
4213     __ orrw(r12,  r10, r11);
4214     __ strw(r12,  __ post(newArr, 4));
4215 
4216     __ BIND(ShiftOne);
4217     __ ldrw(r10,  Address(oldArr));
4218     __ ldrw(r11,  Address(oldArrNext));
4219     __ lslvw(r10, r10, shiftCount);
4220     __ lsrvw(r11, r11, shiftRevCount);
4221     __ orrw(r12,  r10, r11);
4222     __ strw(r12,  Address(newArr));
4223 
4224     __ BIND(Exit);
4225     __ ret(lr);
4226 
4227     return start;
4228   }
4229 
4230   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
4231                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
4232                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
4233     // Karatsuba multiplication performs a 128*128 -> 256-bit
4234     // multiplication in three 128-bit multiplications and a few
4235     // additions.
4236     //
4237     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
4238     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
4239     //
4240     // Inputs:
4241     //
4242     // A0 in a.d[0]     (subkey)
4243     // A1 in a.d[1]
4244     // (A1+A0) in a1_xor_a0.d[0]
4245     //
4246     // B0 in b.d[0]     (state)
4247     // B1 in b.d[1]
4248 
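    // For reference, a scalar sketch of the recombination above (illustrative
    // only; clmul64() is a hypothetical 64x64 -> 128-bit carry-less multiply
    // and "^" is XOR, which plays the role of addition in GF(2)):
    //
    //   uint128 C   = clmul64(A1, B1);
    //   uint128 D   = clmul64(A0, B0);
    //   uint128 E   = clmul64(A1 ^ A0, B1 ^ B0);
    //   uint128 mid = C ^ D ^ E;                 // the true middle term
    //   result_lo   = D ^ (mid << 64);           // (D1+C0+D0+E0):D0
    //   result_hi   = C ^ (mid >> 64);           // C1:(C0+C1+D1+E1)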
4249     __ ext(tmp1, __ T16B, b, b, 0x08);
4250     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
4251     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
4252     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
4253     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
4254 
4255     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
4256     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
4257     __ eor(tmp2, __ T16B, tmp2, tmp4);
4258     __ eor(tmp2, __ T16B, tmp2, tmp3);
4259 
4260     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
4261     __ ins(result_hi, __ D, tmp2, 0, 1);
4262     __ ins(result_lo, __ D, tmp2, 1, 0);
4263   }
4264 
4265   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
4266                     FloatRegister p, FloatRegister z, FloatRegister t1) {
4267     const FloatRegister t0 = result;
4268 
4269     // The GCM field polynomial f is z^128 + p(z), where p =
4270     // z^7+z^2+z+1.
4271     //
4272     //    z^128 === -p(z)  (mod (z^128 + p(z)))
4273     //
4274     // so, given that the product we're reducing is
4275     //    a == lo + hi * z^128
4276     // substituting,
4277     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
4278     //
4279     // we reduce by multiplying hi by p(z) and subtracting the result
4280     // from (i.e. XORing it with) lo.  Because p has no nonzero high
4281     // bits we can do this with two 64-bit multiplications, lo*p and
4282     // hi*p.
4283 
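    // A scalar view of the two folds below (illustrative only; z is the
    // all-zero register, both 64-bit lanes of p hold the constant 0x87, and
    // clmul64()/high64()/low64() are hypothetical helpers):
    //
    //   t        = clmul64(hi.d[1], p);   // carry-less product, < 2^128
    //   hi.d[0] ^= high64(t);             // fold the overflow back down
    //   lo.d[1] ^= low64(t);
    //   result   = lo ^ clmul64(hi.d[0], p);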
4284     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
4285     __ ext(t1, __ T16B, t0, z, 8);
4286     __ eor(hi, __ T16B, hi, t1);
4287     __ ext(t1, __ T16B, z, t0, 8);
4288     __ eor(lo, __ T16B, lo, t1);
4289     __ pmull(t0, __ T1Q, hi, p, __ T1D);
4290     __ eor(result, __ T16B, lo, t0);
4291   }
4292 
4293   address generate_has_negatives(address &has_negatives_long) {
4294     const u1 large_loop_size = 64;
4295     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4296     int dcache_line = VM_Version::dcache_line_size();
4297 
4298     Register ary1 = r1, len = r2, result = r0;
4299 
4300     __ align(CodeEntryAlignment);
4301 
4302     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4303 
4304     address entry = __ pc();
4305 
4306     __ enter();
4307 
4308   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4309         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4310 
4311   __ cmp(len, (u1)15);
4312   __ br(Assembler::GT, LEN_OVER_15);
4313   // The only case when execution falls into this code is when the pointer is
4314   // near the end of a memory page and we have to avoid reading the next page
4315   __ add(ary1, ary1, len);
4316   __ subs(len, len, 8);
4317   __ br(Assembler::GT, LEN_OVER_8);
4318   __ ldr(rscratch2, Address(ary1, -8));
4319   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4320   __ lsrv(rscratch2, rscratch2, rscratch1);
4321   __ tst(rscratch2, UPPER_BIT_MASK);
4322   __ cset(result, Assembler::NE);
4323   __ leave();
4324   __ ret(lr);
4325   __ bind(LEN_OVER_8);
4326   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4327   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
4328   __ tst(rscratch2, UPPER_BIT_MASK);
4329   __ br(Assembler::NE, RET_TRUE_NO_POP);
4330   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4331   __ lsrv(rscratch1, rscratch1, rscratch2);
4332   __ tst(rscratch1, UPPER_BIT_MASK);
4333   __ cset(result, Assembler::NE);
4334   __ leave();
4335   __ ret(lr);
4336 
4337   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4338   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4339 
4340   has_negatives_long = __ pc(); // 2nd entry point
4341 
4342   __ enter();
4343 
4344   __ bind(LEN_OVER_15);
4345     __ push(spilled_regs, sp);
4346     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4347     __ cbz(rscratch2, ALIGNED);
4348     __ ldp(tmp6, tmp1, Address(ary1));
4349     __ mov(tmp5, 16);
4350     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4351     __ add(ary1, ary1, rscratch1);
4352     __ sub(len, len, rscratch1);
4353     __ orr(tmp6, tmp6, tmp1);
4354     __ tst(tmp6, UPPER_BIT_MASK);
4355     __ br(Assembler::NE, RET_TRUE);
4356 
4357   __ bind(ALIGNED);
4358     __ cmp(len, large_loop_size);
4359     __ br(Assembler::LT, CHECK_16);
4360     // Perform a 16-byte load early, as a pre-loop return, to handle the case
4361     // where an initially aligned large array has negative values in its first
4362     // bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in the
4363     // worst case, which is slower. Cases with negative bytes further ahead are
4364     // barely affected; in fact they get faster thanks to the early loads and
4365     // the fewer instructions and branches in LARGE_LOOP.
4366     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4367     __ sub(len, len, 16);
4368     __ orr(tmp6, tmp6, tmp1);
4369     __ tst(tmp6, UPPER_BIT_MASK);
4370     __ br(Assembler::NE, RET_TRUE);
4371     __ cmp(len, large_loop_size);
4372     __ br(Assembler::LT, CHECK_16);
4373 
4374     if (SoftwarePrefetchHintDistance >= 0
4375         && SoftwarePrefetchHintDistance >= dcache_line) {
4376       // initial prefetch
4377       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4378     }
4379   __ bind(LARGE_LOOP);
4380     if (SoftwarePrefetchHintDistance >= 0) {
4381       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4382     }
4383     // Issue the load instructions first, since that can save a few CPU/memory
4384     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
4385     // (one per ldp), generating 7 orr(...) + 1 andr(...) + 1 cbnz(...) saves
4386     // 3 instructions per iteration and has fewer branches; the trade-off is
4387     // that early return is disabled, so all 64 bytes are loaded and checked every time.
4388     __ ldp(tmp2, tmp3, Address(ary1));
4389     __ ldp(tmp4, tmp5, Address(ary1, 16));
4390     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4391     __ ldp(tmp6, tmp1, Address(ary1, 48));
4392     __ add(ary1, ary1, large_loop_size);
4393     __ sub(len, len, large_loop_size);
4394     __ orr(tmp2, tmp2, tmp3);
4395     __ orr(tmp4, tmp4, tmp5);
4396     __ orr(rscratch1, rscratch1, rscratch2);
4397     __ orr(tmp6, tmp6, tmp1);
4398     __ orr(tmp2, tmp2, tmp4);
4399     __ orr(rscratch1, rscratch1, tmp6);
4400     __ orr(tmp2, tmp2, rscratch1);
4401     __ tst(tmp2, UPPER_BIT_MASK);
4402     __ br(Assembler::NE, RET_TRUE);
4403     __ cmp(len, large_loop_size);
4404     __ br(Assembler::GE, LARGE_LOOP);
4405 
4406   __ bind(CHECK_16); // small 16-byte load pre-loop
4407     __ cmp(len, (u1)16);
4408     __ br(Assembler::LT, POST_LOOP16);
4409 
4410   __ bind(LOOP16); // small 16-byte load loop
4411     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4412     __ sub(len, len, 16);
4413     __ orr(tmp2, tmp2, tmp3);
4414     __ tst(tmp2, UPPER_BIT_MASK);
4415     __ br(Assembler::NE, RET_TRUE);
4416     __ cmp(len, (u1)16);
4417     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4418 
4419   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4420     __ cmp(len, (u1)8);
4421     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4422     __ ldr(tmp3, Address(__ post(ary1, 8)));
4423     __ sub(len, len, 8);
4424     __ tst(tmp3, UPPER_BIT_MASK);
4425     __ br(Assembler::NE, RET_TRUE);
4426 
4427   __ bind(POST_LOOP16_LOAD_TAIL);
4428     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4429     __ ldr(tmp1, Address(ary1));
4430     __ mov(tmp2, 64);
4431     __ sub(tmp4, tmp2, len, __ LSL, 3);
4432     __ lslv(tmp1, tmp1, tmp4);
4433     __ tst(tmp1, UPPER_BIT_MASK);
4434     __ br(Assembler::NE, RET_TRUE);
4435     // Fallthrough
4436 
4437   __ bind(RET_FALSE);
4438     __ pop(spilled_regs, sp);
4439     __ leave();
4440     __ mov(result, zr);
4441     __ ret(lr);
4442 
4443   __ bind(RET_TRUE);
4444     __ pop(spilled_regs, sp);
4445   __ bind(RET_TRUE_NO_POP);
4446     __ leave();
4447     __ mov(result, 1);
4448     __ ret(lr);
4449 
4450   __ bind(DONE);
4451     __ pop(spilled_regs, sp);
4452     __ leave();
4453     __ ret(lr);
4454     return entry;
4455   }
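  // A minimal scalar reference of what the stub above computes (illustrative
  // only, not part of the generated code):
  //
  //   static bool has_negatives_ref(const jbyte* a, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       if (a[i] < 0) return true;   // i.e. the 0x80 bit of the byte is set
  //     }
  //     return false;
  //   }
  //
  // The stub reaches the same answer by OR-ing bytes together in 8-, 16- and
  // 64-byte chunks and testing the accumulator against UPPER_BIT_MASK.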
4456 
4457   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4458         bool usePrefetch, Label &NOT_EQUAL) {
4459     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4460         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4461         tmp7 = r12, tmp8 = r13;
4462     Label LOOP;
4463 
4464     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4465     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4466     __ bind(LOOP);
4467     if (usePrefetch) {
4468       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4469       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4470     }
4471     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4472     __ eor(tmp1, tmp1, tmp2);
4473     __ eor(tmp3, tmp3, tmp4);
4474     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4475     __ orr(tmp1, tmp1, tmp3);
4476     __ cbnz(tmp1, NOT_EQUAL);
4477     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4478     __ eor(tmp5, tmp5, tmp6);
4479     __ eor(tmp7, tmp7, tmp8);
4480     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4481     __ orr(tmp5, tmp5, tmp7);
4482     __ cbnz(tmp5, NOT_EQUAL);
4483     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4484     __ eor(tmp1, tmp1, tmp2);
4485     __ eor(tmp3, tmp3, tmp4);
4486     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4487     __ orr(tmp1, tmp1, tmp3);
4488     __ cbnz(tmp1, NOT_EQUAL);
4489     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4490     __ eor(tmp5, tmp5, tmp6);
4491     __ sub(cnt1, cnt1, 8 * wordSize);
4492     __ eor(tmp7, tmp7, tmp8);
4493     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4494     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4495     // cmp) because subs allows an unlimited range of immediate operands.
4496     __ subs(tmp6, cnt1, loopThreshold);
4497     __ orr(tmp5, tmp5, tmp7);
4498     __ cbnz(tmp5, NOT_EQUAL);
4499     __ br(__ GE, LOOP);
4500     // post-loop
4501     __ eor(tmp1, tmp1, tmp2);
4502     __ eor(tmp3, tmp3, tmp4);
4503     __ orr(tmp1, tmp1, tmp3);
4504     __ sub(cnt1, cnt1, 2 * wordSize);
4505     __ cbnz(tmp1, NOT_EQUAL);
4506   }
4507 
4508   void generate_large_array_equals_loop_simd(int loopThreshold,
4509         bool usePrefetch, Label &NOT_EQUAL) {
4510     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4511         tmp2 = rscratch2;
4512     Label LOOP;
4513 
4514     __ bind(LOOP);
4515     if (usePrefetch) {
4516       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4517       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4518     }
4519     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4520     __ sub(cnt1, cnt1, 8 * wordSize);
4521     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4522     __ subs(tmp1, cnt1, loopThreshold);
4523     __ eor(v0, __ T16B, v0, v4);
4524     __ eor(v1, __ T16B, v1, v5);
4525     __ eor(v2, __ T16B, v2, v6);
4526     __ eor(v3, __ T16B, v3, v7);
4527     __ orr(v0, __ T16B, v0, v1);
4528     __ orr(v1, __ T16B, v2, v3);
4529     __ orr(v0, __ T16B, v0, v1);
4530     __ umov(tmp1, v0, __ D, 0);
4531     __ umov(tmp2, v0, __ D, 1);
4532     __ orr(tmp1, tmp1, tmp2);
4533     __ cbnz(tmp1, NOT_EQUAL);
4534     __ br(__ GE, LOOP);
4535   }
4536 
4537   // a1 = r1 - array1 address
4538   // a2 = r2 - array2 address
4539   // result = r0 - return value. Already contains "false"
4540   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4541   // r3-r5 are reserved temporary registers
4542   address generate_large_array_equals() {
4543     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4544         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4545         tmp7 = r12, tmp8 = r13;
4546     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4547         SMALL_LOOP, POST_LOOP;
4548     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4549     // threshold for the prefetching loop: ensures at least 32 of the prefetched bytes are actually used
4550     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4551     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4552     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4553     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4554         tmp5, tmp6, tmp7, tmp8);
4555 
4556     __ align(CodeEntryAlignment);
4557 
4558     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4559 
4560     address entry = __ pc();
4561     __ enter();
4562     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4563     // also advance pointers to use post-increment instead of pre-increment
4564     __ add(a1, a1, wordSize);
4565     __ add(a2, a2, wordSize);
4566     if (AvoidUnalignedAccesses) {
4567       // Both implementations (SIMD and non-SIMD) use relatively large load
4568       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
4569       // time) on some CPUs when the address is not at least 16-byte aligned.
4570       // Arrays are currently 8-byte aligned, so do an extra 8-byte load if
4571       // needed, at least for the first address, to make it 16-byte aligned.
4572       Label ALIGNED16;
4573       __ tbz(a1, 3, ALIGNED16);
4574       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4575       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4576       __ sub(cnt1, cnt1, wordSize);
4577       __ eor(tmp1, tmp1, tmp2);
4578       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4579       __ bind(ALIGNED16);
4580     }
4581     if (UseSIMDForArrayEquals) {
4582       if (SoftwarePrefetchHintDistance >= 0) {
4583         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4584         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4585         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4586             /* prfm = */ true, NOT_EQUAL);
4587         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4588         __ br(__ LT, TAIL);
4589       }
4590       __ bind(NO_PREFETCH_LARGE_LOOP);
4591       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4592           /* prfm = */ false, NOT_EQUAL);
4593     } else {
4594       __ push(spilled_regs, sp);
4595       if (SoftwarePrefetchHintDistance >= 0) {
4596         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4597         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4598         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4599             /* prfm = */ true, NOT_EQUAL);
4600         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4601         __ br(__ LT, TAIL);
4602       }
4603       __ bind(NO_PREFETCH_LARGE_LOOP);
4604       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4605           /* prfm = */ false, NOT_EQUAL);
4606     }
4607     __ bind(TAIL);
4608       __ cbz(cnt1, EQUAL);
4609       __ subs(cnt1, cnt1, wordSize);
4610       __ br(__ LE, POST_LOOP);
4611     __ bind(SMALL_LOOP);
4612       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4613       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4614       __ subs(cnt1, cnt1, wordSize);
4615       __ eor(tmp1, tmp1, tmp2);
4616       __ cbnz(tmp1, NOT_EQUAL);
4617       __ br(__ GT, SMALL_LOOP);
4618     __ bind(POST_LOOP);
4619       __ ldr(tmp1, Address(a1, cnt1));
4620       __ ldr(tmp2, Address(a2, cnt1));
4621       __ eor(tmp1, tmp1, tmp2);
4622       __ cbnz(tmp1, NOT_EQUAL);
4623     __ bind(EQUAL);
4624       __ mov(result, true);
4625     __ bind(NOT_EQUAL);
4626       if (!UseSIMDForArrayEquals) {
4627         __ pop(spilled_regs, sp);
4628       }
4629     __ bind(NOT_EQUAL_NO_POP);
4630     __ leave();
4631     __ ret(lr);
4632     return entry;
4633   }
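  // Scalar reference of what the stub above computes (illustrative only):
  // given that the caller has already compared the first wordSize bytes, the
  // stub is equivalent to
  //
  //   result = (memcmp(a1, a2, remaining_bytes) == 0) ? 1 : 0;
  //
  // implemented with 8-byte (non-SIMD) or 16-byte (SIMD) wide XOR/OR checks.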
4634 
4635   address generate_dsin_dcos(bool isCos) {
4636     __ align(CodeEntryAlignment);
4637     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4638     address start = __ pc();
4639     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4640         (address)StubRoutines::aarch64::_two_over_pi,
4641         (address)StubRoutines::aarch64::_pio2,
4642         (address)StubRoutines::aarch64::_dsin_coef,
4643         (address)StubRoutines::aarch64::_dcos_coef);
4644     return start;
4645   }
4646 
4647   address generate_dlog() {
4648     __ align(CodeEntryAlignment);
4649     StubCodeMark mark(this, "StubRoutines", "dlog");
4650     address entry = __ pc();
4651     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4652         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4653     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4654     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4655         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4656     return entry;
4657   }
4658 
4659   // code for comparing 16 bytes of strings with same encoding
4660   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4661     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4662     __ ldr(rscratch1, Address(__ post(str1, 8)));
4663     __ eor(rscratch2, tmp1, tmp2);
4664     __ ldr(cnt1, Address(__ post(str2, 8)));
4665     __ cbnz(rscratch2, DIFF1);
4666     __ ldr(tmp1, Address(__ post(str1, 8)));
4667     __ eor(rscratch2, rscratch1, cnt1);
4668     __ ldr(tmp2, Address(__ post(str2, 8)));
4669     __ cbnz(rscratch2, DIFF2);
4670   }
4671 
4672   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4673   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4674       Label &DIFF2) {
4675     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4676     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4677 
4678     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4679     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4680     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4681     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4682 
4683     __ fmovd(tmpL, vtmp3);
4684     __ eor(rscratch2, tmp3, tmpL);
4685     __ cbnz(rscratch2, DIFF2);
4686 
4687     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4688     __ umov(tmpL, vtmp3, __ D, 1);
4689     __ eor(rscratch2, tmpU, tmpL);
4690     __ cbnz(rscratch2, DIFF1);
4691 
4692     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4693     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4694     __ fmovd(tmpL, vtmp);
4695     __ eor(rscratch2, tmp3, tmpL);
4696     __ cbnz(rscratch2, DIFF2);
4697 
4698     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4699     __ umov(tmpL, vtmp, __ D, 1);
4700     __ eor(rscratch2, tmpU, tmpL);
4701     __ cbnz(rscratch2, DIFF1);
4702   }
4703 
4704   // r0  = result
4705   // r1  = str1
4706   // r2  = cnt1
4707   // r3  = str2
4708   // r4  = cnt2
4709   // r10 = tmp1
4710   // r11 = tmp2
4711   address generate_compare_long_string_different_encoding(bool isLU) {
4712     __ align(CodeEntryAlignment);
4713     StubCodeMark mark(this, "StubRoutines", isLU
4714         ? "compare_long_string_different_encoding LU"
4715         : "compare_long_string_different_encoding UL");
4716     address entry = __ pc();
4717     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4718         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4719         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4720     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4721         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4722     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4723     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4724 
4725     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4726 
4727     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4728     // cnt2 == amount of characters left to compare
4729     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
4730     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4731     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4732     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4733     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4734     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4735     __ eor(rscratch2, tmp1, tmp2);
4736     __ mov(rscratch1, tmp2);
4737     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4738     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4739              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4740     __ push(spilled_regs, sp);
4741     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4742     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4743 
4744     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4745 
4746     if (SoftwarePrefetchHintDistance >= 0) {
4747       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4748       __ br(__ LT, NO_PREFETCH);
4749       __ bind(LARGE_LOOP_PREFETCH);
4750         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4751         __ mov(tmp4, 2);
4752         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4753         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4754           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4755           __ subs(tmp4, tmp4, 1);
4756           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4757           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4758           __ mov(tmp4, 2);
4759         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4760           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4761           __ subs(tmp4, tmp4, 1);
4762           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4763           __ sub(cnt2, cnt2, 64);
4764           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4765           __ br(__ GE, LARGE_LOOP_PREFETCH);
4766     }
4767     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4768     __ bind(NO_PREFETCH);
4769     __ subs(cnt2, cnt2, 16);
4770     __ br(__ LT, TAIL);
4771     __ align(OptoLoopAlignment);
4772     __ bind(SMALL_LOOP); // smaller loop
4773       __ subs(cnt2, cnt2, 16);
4774       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4775       __ br(__ GE, SMALL_LOOP);
4776       __ cmn(cnt2, (u1)16);
4777       __ br(__ EQ, LOAD_LAST);
4778     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4779       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4780       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4781       __ ldr(tmp3, Address(cnt1, -8));
4782       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4783       __ b(LOAD_LAST);
4784     __ bind(DIFF2);
4785       __ mov(tmpU, tmp3);
4786     __ bind(DIFF1);
4787       __ pop(spilled_regs, sp);
4788       __ b(CALCULATE_DIFFERENCE);
4789     __ bind(LOAD_LAST);
4790       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4791       // No need to load it again
4792       __ mov(tmpU, tmp3);
4793       __ pop(spilled_regs, sp);
4794 
4795       // tmp2 points to the address of the last 4 Latin1 characters right now
4796       __ ldrs(vtmp, Address(tmp2));
4797       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4798       __ fmovd(tmpL, vtmp);
4799 
4800       __ eor(rscratch2, tmpU, tmpL);
4801       __ cbz(rscratch2, DONE);
4802 
4803     // Find the first different characters in the longwords and
4804     // compute their difference.
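    // At this point rscratch2 holds the XOR of the two words being compared
    // (one inflated Latin1, one UTF-16), so it is non-zero exactly where they
    // differ. rev + clz yield the bit offset of the first differing byte in
    // memory order, andr rounds that down to a 16-bit character boundary, and
    // the lsrv/uxthw pairs extract that character from each word so that
    // their difference can be returned.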
4805     __ bind(CALCULATE_DIFFERENCE);
4806       __ rev(rscratch2, rscratch2);
4807       __ clz(rscratch2, rscratch2);
4808       __ andr(rscratch2, rscratch2, -16);
4809       __ lsrv(tmp1, tmp1, rscratch2);
4810       __ uxthw(tmp1, tmp1);
4811       __ lsrv(rscratch1, rscratch1, rscratch2);
4812       __ uxthw(rscratch1, rscratch1);
4813       __ subw(result, tmp1, rscratch1);
4814     __ bind(DONE);
4815       __ ret(lr);
4816     return entry;
4817   }
4818 
4819     address generate_method_entry_barrier() {
4820     __ align(CodeEntryAlignment);
4821     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4822 
4823     Label deoptimize_label;
4824 
4825     address start = __ pc();
4826 
4827     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4828 
4829     __ enter();
4830     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4831 
4832     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4833 
4834     __ push_call_clobbered_registers();
4835 
4836     __ mov(c_rarg0, rscratch2);
4837     __ call_VM_leaf
4838          (CAST_FROM_FN_PTR
4839           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4840 
4841     __ reset_last_Java_frame(true);
4842 
4843     __ mov(rscratch1, r0);
4844 
4845     __ pop_call_clobbered_registers();
4846 
4847     __ cbnz(rscratch1, deoptimize_label);
4848 
4849     __ leave();
4850     __ ret(lr);
4851 
4852     __ BIND(deoptimize_label);
4853 
4854     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4855     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4856 
4857     __ mov(sp, rscratch1);
4858     __ br(rscratch2);
4859 
4860     return start;
4861   }
4862 
4863   // r0  = result
4864   // r1  = str1
4865   // r2  = cnt1
4866   // r3  = str2
4867   // r4  = cnt2
4868   // r10 = tmp1
4869   // r11 = tmp2
4870   address generate_compare_long_string_same_encoding(bool isLL) {
4871     __ align(CodeEntryAlignment);
4872     StubCodeMark mark(this, "StubRoutines", isLL
4873         ? "compare_long_string_same_encoding LL"
4874         : "compare_long_string_same_encoding UU");
4875     address entry = __ pc();
4876     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4877         tmp1 = r10, tmp2 = r11;
4878     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4879         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4880         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4881     // exit from the large loop when fewer than 64 bytes are left to read or we
4882     // are about to prefetch memory beyond the array boundary
4883     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4884     // cnt1/cnt2 contain the number of characters left to compare; cnt1 can be
4885     // re-used. Update the cnt2 counter to account for the 8 bytes already loaded.
4886     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4887     // update pointers, because of previous read
4888     __ add(str1, str1, wordSize);
4889     __ add(str2, str2, wordSize);
4890     if (SoftwarePrefetchHintDistance >= 0) {
4891       __ bind(LARGE_LOOP_PREFETCH);
4892         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4893         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4894         compare_string_16_bytes_same(DIFF, DIFF2);
4895         compare_string_16_bytes_same(DIFF, DIFF2);
4896         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4897         compare_string_16_bytes_same(DIFF, DIFF2);
4898         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4899         compare_string_16_bytes_same(DIFF, DIFF2);
4900         __ br(__ GT, LARGE_LOOP_PREFETCH);
4901         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4902     }
4903     // less than 16 bytes left?
4904     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4905     __ br(__ LT, TAIL);
4906     __ align(OptoLoopAlignment);
4907     __ bind(SMALL_LOOP);
4908       compare_string_16_bytes_same(DIFF, DIFF2);
4909       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4910       __ br(__ GE, SMALL_LOOP);
4911     __ bind(TAIL);
4912       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4913       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4914       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4915       __ br(__ LE, CHECK_LAST);
4916       __ eor(rscratch2, tmp1, tmp2);
4917       __ cbnz(rscratch2, DIFF);
4918       __ ldr(tmp1, Address(__ post(str1, 8)));
4919       __ ldr(tmp2, Address(__ post(str2, 8)));
4920       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4921     __ bind(CHECK_LAST);
4922       if (!isLL) {
4923         __ add(cnt2, cnt2, cnt2); // now in bytes
4924       }
4925       __ eor(rscratch2, tmp1, tmp2);
4926       __ cbnz(rscratch2, DIFF);
4927       __ ldr(rscratch1, Address(str1, cnt2));
4928       __ ldr(cnt1, Address(str2, cnt2));
4929       __ eor(rscratch2, rscratch1, cnt1);
4930       __ cbz(rscratch2, LENGTH_DIFF);
4931       // Find the first different characters in the longwords and
4932       // compute their difference.
4933     __ bind(DIFF2);
4934       __ rev(rscratch2, rscratch2);
4935       __ clz(rscratch2, rscratch2);
4936       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4937       __ lsrv(rscratch1, rscratch1, rscratch2);
4938       if (isLL) {
4939         __ lsrv(cnt1, cnt1, rscratch2);
4940         __ uxtbw(rscratch1, rscratch1);
4941         __ uxtbw(cnt1, cnt1);
4942       } else {
4943         __ lsrv(cnt1, cnt1, rscratch2);
4944         __ uxthw(rscratch1, rscratch1);
4945         __ uxthw(cnt1, cnt1);
4946       }
4947       __ subw(result, rscratch1, cnt1);
4948       __ b(LENGTH_DIFF);
4949     __ bind(DIFF);
4950       __ rev(rscratch2, rscratch2);
4951       __ clz(rscratch2, rscratch2);
4952       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4953       __ lsrv(tmp1, tmp1, rscratch2);
4954       if (isLL) {
4955         __ lsrv(tmp2, tmp2, rscratch2);
4956         __ uxtbw(tmp1, tmp1);
4957         __ uxtbw(tmp2, tmp2);
4958       } else {
4959         __ lsrv(tmp2, tmp2, rscratch2);
4960         __ uxthw(tmp1, tmp1);
4961         __ uxthw(tmp2, tmp2);
4962       }
4963       __ subw(result, tmp1, tmp2);
4964       __ b(LENGTH_DIFF);
4965     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4966       __ eor(rscratch2, tmp1, tmp2);
4967       __ cbnz(rscratch2, DIFF);
4968     __ bind(LENGTH_DIFF);
4969       __ ret(lr);
4970     return entry;
4971   }
4972 
4973   void generate_compare_long_strings() {
4974       StubRoutines::aarch64::_compare_long_string_LL
4975           = generate_compare_long_string_same_encoding(true);
4976       StubRoutines::aarch64::_compare_long_string_UU
4977           = generate_compare_long_string_same_encoding(false);
4978       StubRoutines::aarch64::_compare_long_string_LU
4979           = generate_compare_long_string_different_encoding(true);
4980       StubRoutines::aarch64::_compare_long_string_UL
4981           = generate_compare_long_string_different_encoding(false);
4982   }
4983 
4984   // R0 = result
4985   // R1 = str2
4986   // R2 = cnt1
4987   // R3 = str1
4988   // R4 = cnt2
4989   // This generic linear code uses a few additional ideas that make it faster:
4990   // 1) since length >= 8, we can safely keep at least the 1st register of the
4991   // pattern loaded, skipping the initial load (helps on systems with 1 ld pipeline)
4992   // 2) we can use the "fast" algorithm for finding the first character to
4993   // search for, with fewer branches (1 branch per loaded register instead of
4994   // one per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
4995   // 0x7f7f...7f, 0x7fff7fff...7fff come from
4996   // 3) after loading and analyzing the 1st register of the source string, it
4997   // can be used to search for every occurrence of the first character, saving a
4998   // few loads compared with a "simpler-but-slower" implementation
4999   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5000   // re-initializes and compresses register values, which makes the code larger
5001   // and a bit less readable; however, most of the extra operations are issued
5002   // during loads or branches, so the penalty is minimal
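  // The "fast" first-character search mentioned above is the classic SWAR
  // zero-byte test (illustrative sketch for the byte/LL case; the UTF-16
  // cases use the 16-bit constants instead):
  //
  //   first = first_char * 0x0101010101010101;   // replicate the first char
  //   x     = chunk ^ first;                      // matching bytes become 0
  //   hit   = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f);
  //
  // 'hit' has the top bit set in every byte position where 'chunk' contained
  // the pattern's first character; candidate positions are then verified by
  // the CMP loops below.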
5003   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5004     const char* stubName = str1_isL
5005         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5006         : "indexof_linear_uu";
5007     __ align(CodeEntryAlignment);
5008     StubCodeMark mark(this, "StubRoutines", stubName);
5009     address entry = __ pc();
5010 
5011     int str1_chr_size = str1_isL ? 1 : 2;
5012     int str2_chr_size = str2_isL ? 1 : 2;
5013     int str1_chr_shift = str1_isL ? 0 : 1;
5014     int str2_chr_shift = str2_isL ? 0 : 1;
5015     bool isL = str1_isL && str2_isL;
5016    // parameters
5017     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5018     // temporary registers
5019     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5020     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5021     // redefinitions
5022     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5023 
5024     __ push(spilled_regs, sp);
5025     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5026         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5027         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5028         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5029         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5030         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5031     // Read whole register from str1. It is safe, because length >=8 here
5032     __ ldr(ch1, Address(str1));
5033     // Read whole register from str2. It is safe, because length >=8 here
5034     __ ldr(ch2, Address(str2));
5035     __ sub(cnt2, cnt2, cnt1);
5036     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5037     if (str1_isL != str2_isL) {
5038       __ eor(v0, __ T16B, v0, v0);
5039     }
5040     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5041     __ mul(first, first, tmp1);
5042     // check if we have less than 1 register to check
5043     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5044     if (str1_isL != str2_isL) {
5045       __ fmovd(v1, ch1);
5046     }
5047     __ br(__ LE, L_SMALL);
5048     __ eor(ch2, first, ch2);
5049     if (str1_isL != str2_isL) {
5050       __ zip1(v1, __ T16B, v1, v0);
5051     }
5052     __ sub(tmp2, ch2, tmp1);
5053     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5054     __ bics(tmp2, tmp2, ch2);
5055     if (str1_isL != str2_isL) {
5056       __ fmovd(ch1, v1);
5057     }
5058     __ br(__ NE, L_HAS_ZERO);
5059     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5060     __ add(result, result, wordSize/str2_chr_size);
5061     __ add(str2, str2, wordSize);
5062     __ br(__ LT, L_POST_LOOP);
5063     __ BIND(L_LOOP);
5064       __ ldr(ch2, Address(str2));
5065       __ eor(ch2, first, ch2);
5066       __ sub(tmp2, ch2, tmp1);
5067       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5068       __ bics(tmp2, tmp2, ch2);
5069       __ br(__ NE, L_HAS_ZERO);
5070     __ BIND(L_LOOP_PROCEED);
5071       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5072       __ add(str2, str2, wordSize);
5073       __ add(result, result, wordSize/str2_chr_size);
5074       __ br(__ GE, L_LOOP);
5075     __ BIND(L_POST_LOOP);
5076       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5077       __ br(__ LE, NOMATCH);
5078       __ ldr(ch2, Address(str2));
5079       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5080       __ eor(ch2, first, ch2);
5081       __ sub(tmp2, ch2, tmp1);
5082       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5083       __ mov(tmp4, -1); // all bits set
5084       __ b(L_SMALL_PROCEED);
5085     __ align(OptoLoopAlignment);
5086     __ BIND(L_SMALL);
5087       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5088       __ eor(ch2, first, ch2);
5089       if (str1_isL != str2_isL) {
5090         __ zip1(v1, __ T16B, v1, v0);
5091       }
5092       __ sub(tmp2, ch2, tmp1);
5093       __ mov(tmp4, -1); // all bits set
5094       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5095       if (str1_isL != str2_isL) {
5096         __ fmovd(ch1, v1); // move converted 4 symbols
5097       }
5098     __ BIND(L_SMALL_PROCEED);
5099       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5100       __ bic(tmp2, tmp2, ch2);
5101       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5102       __ rbit(tmp2, tmp2);
5103       __ br(__ EQ, NOMATCH);
5104     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5105       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5106       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5107       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5108       if (str2_isL) { // LL
5109         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5110         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5111         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5112         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5113         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5114       } else {
5115         __ mov(ch2, 0xE); // all bits in byte set except last one
5116         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5117         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5118         __ lslv(tmp2, tmp2, tmp4);
5119         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5120         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5121         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5122         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5123       }
5124       __ cmp(ch1, ch2);
5125       __ mov(tmp4, wordSize/str2_chr_size);
5126       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5127     __ BIND(L_SMALL_CMP_LOOP);
5128       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5129                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5130       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5131                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5132       __ add(tmp4, tmp4, 1);
5133       __ cmp(tmp4, cnt1);
5134       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5135       __ cmp(first, ch2);
5136       __ br(__ EQ, L_SMALL_CMP_LOOP);
5137     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5138       __ cbz(tmp2, NOMATCH); // no more matches. exit
5139       __ clz(tmp4, tmp2);
5140       __ add(result, result, 1); // advance index
5141       __ add(str2, str2, str2_chr_size); // advance pointer
5142       __ b(L_SMALL_HAS_ZERO_LOOP);
5143     __ align(OptoLoopAlignment);
5144     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5145       __ cmp(first, ch2);
5146       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5147       __ b(DONE);
5148     __ align(OptoLoopAlignment);
5149     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5150       if (str2_isL) { // LL
5151         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5152         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5153         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5154         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5155         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5156       } else {
5157         __ mov(ch2, 0xE); // all bits in byte set except last one
5158         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5159         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5160         __ lslv(tmp2, tmp2, tmp4);
5161         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5162         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5163         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5164         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5165       }
5166       __ cmp(ch1, ch2);
5167       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5168       __ b(DONE);
5169     __ align(OptoLoopAlignment);
5170     __ BIND(L_HAS_ZERO);
5171       __ rbit(tmp2, tmp2);
5172       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5173       // Now compress both counters (cnt2 and cnt1) into one register. This is
5174       // fine because both counters are 32-bit and are not changed in this loop;
5175       // they are restored on exit, so cnt1 can be re-used in this loop.
5176       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5177       __ sub(result, result, 1);
5178     __ BIND(L_HAS_ZERO_LOOP);
5179       __ mov(cnt1, wordSize/str2_chr_size);
5180       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5181       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5182       if (str2_isL) {
5183         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5184         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5185         __ lslv(tmp2, tmp2, tmp4);
5186         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5187         __ add(tmp4, tmp4, 1);
5188         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5189         __ lsl(tmp2, tmp2, 1);
5190         __ mov(tmp4, wordSize/str2_chr_size);
5191       } else {
5192         __ mov(ch2, 0xE);
5193         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5194         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5195         __ lslv(tmp2, tmp2, tmp4);
5196         __ add(tmp4, tmp4, 1);
5197         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5198         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5199         __ lsl(tmp2, tmp2, 1);
5200         __ mov(tmp4, wordSize/str2_chr_size);
5201         __ sub(str2, str2, str2_chr_size);
5202       }
5203       __ cmp(ch1, ch2);
5204       __ mov(tmp4, wordSize/str2_chr_size);
5205       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5206     __ BIND(L_CMP_LOOP);
5207       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5208                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5209       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5210                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5211       __ add(tmp4, tmp4, 1);
5212       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5213       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5214       __ cmp(cnt1, ch2);
5215       __ br(__ EQ, L_CMP_LOOP);
5216     __ BIND(L_CMP_LOOP_NOMATCH);
5217       // here we're not matched
5218       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5219       __ clz(tmp4, tmp2);
5220       __ add(str2, str2, str2_chr_size); // advance pointer
5221       __ b(L_HAS_ZERO_LOOP);
5222     __ align(OptoLoopAlignment);
5223     __ BIND(L_CMP_LOOP_LAST_CMP);
5224       __ cmp(cnt1, ch2);
5225       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5226       __ b(DONE);
5227     __ align(OptoLoopAlignment);
5228     __ BIND(L_CMP_LOOP_LAST_CMP2);
5229       if (str2_isL) {
5230         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5231         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5232         __ lslv(tmp2, tmp2, tmp4);
5233         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5234         __ add(tmp4, tmp4, 1);
5235         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5236         __ lsl(tmp2, tmp2, 1);
5237       } else {
5238         __ mov(ch2, 0xE);
5239         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5240         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5241         __ lslv(tmp2, tmp2, tmp4);
5242         __ add(tmp4, tmp4, 1);
5243         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5244         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5245         __ lsl(tmp2, tmp2, 1);
5246         __ sub(str2, str2, str2_chr_size);
5247       }
5248       __ cmp(ch1, ch2);
5249       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5250       __ b(DONE);
5251     __ align(OptoLoopAlignment);
5252     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5253       // 1) Restore the "result" index. It was wordSize/str2_chr_size * N until
5254       // the L_HAS_ZERO block; the byte octet was analyzed in L_HAS_ZERO_LOOP,
5255       // so result was increased by at most wordSize/str2_chr_size - 1 and its
5256       // respective high bits were not changed. L_LOOP_PROCEED will increase
5257       // result by the number of analyzed characters, so we can just reset the
5258       // lower bits of result here (2 lower bits for UU/UL, 3 bits for LL).
5259       // 2) Restore cnt1 and cnt2 from the "compressed" cnt2.
5260       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
5261       // of the last analyzed substring inside the current octet, so str2 is at
5262       // the respective start address; we need to advance it to the next octet.
5263       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5264       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5265       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5266       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5267       __ movw(cnt2, cnt2);
5268       __ b(L_LOOP_PROCEED);
5269     __ align(OptoLoopAlignment);
5270     __ BIND(NOMATCH);
5271       __ mov(result, -1);
5272     __ BIND(DONE);
5273       __ pop(spilled_regs, sp);
5274       __ ret(lr);
5275     return entry;
5276   }
5277 
5278   void generate_string_indexof_stubs() {
5279     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5280     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5281     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5282   }
5283 
5284   void inflate_and_store_2_fp_registers(bool generatePrfm,
5285       FloatRegister src1, FloatRegister src2) {
5286     Register dst = r1;
5287     __ zip1(v1, __ T16B, src1, v0);
5288     __ zip2(v2, __ T16B, src1, v0);
5289     if (generatePrfm) {
5290       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5291     }
5292     __ zip1(v3, __ T16B, src2, v0);
5293     __ zip2(v4, __ T16B, src2, v0);
5294     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5295   }
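  // In scalar terms the helper above stores, for each Latin-1 input byte b,
  // the 16-bit value (jchar)(b & 0xff): zip1/zip2 with the zero register v0
  // interleave a zero byte after every source byte, inflating 32 input bytes
  // to 64 output bytes per call (illustrative description only).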
5296 
5297   // R0 = src
5298   // R1 = dst
5299   // R2 = len
5300   // R3 = len >> 3
5301   // V0 = 0
5302   // v1 = loaded 8 bytes
5303   address generate_large_byte_array_inflate() {
5304     __ align(CodeEntryAlignment);
5305     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5306     address entry = __ pc();
5307     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5308     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5309     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5310 
5311     // do one more 8-byte read so that the address is 16-byte aligned in most
5312     // cases; this also lets us use a single store instruction
5313     __ ldrd(v2, __ post(src, 8));
5314     __ sub(octetCounter, octetCounter, 2);
5315     __ zip1(v1, __ T16B, v1, v0);
5316     __ zip1(v2, __ T16B, v2, v0);
5317     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5318     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5319     __ subs(rscratch1, octetCounter, large_loop_threshold);
5320     __ br(__ LE, LOOP_START);
5321     __ b(LOOP_PRFM_START);
5322     __ bind(LOOP_PRFM);
5323       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5324     __ bind(LOOP_PRFM_START);
5325       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5326       __ sub(octetCounter, octetCounter, 8);
5327       __ subs(rscratch1, octetCounter, large_loop_threshold);
5328       inflate_and_store_2_fp_registers(true, v3, v4);
5329       inflate_and_store_2_fp_registers(true, v5, v6);
5330       __ br(__ GT, LOOP_PRFM);
5331       __ cmp(octetCounter, (u1)8);
5332       __ br(__ LT, DONE);
5333     __ bind(LOOP);
5334       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5335       __ bind(LOOP_START);
5336       __ sub(octetCounter, octetCounter, 8);
5337       __ cmp(octetCounter, (u1)8);
5338       inflate_and_store_2_fp_registers(false, v3, v4);
5339       inflate_and_store_2_fp_registers(false, v5, v6);
5340       __ br(__ GE, LOOP);
5341     __ bind(DONE);
5342       __ ret(lr);
5343     return entry;
5344   }
5345 
5346   /**
5347    *  Arguments:
5348    *
5349    *  Input:
5350    *  c_rarg0   - current state address
5351    *  c_rarg1   - H key address
5352    *  c_rarg2   - data address
5353    *  c_rarg3   - number of blocks
5354    *
5355    *  Output:
5356    *  Updated state at c_rarg0
5357    */
5358   address generate_ghash_processBlocks() {
5359     // Bafflingly, GCM uses little-endian for the byte order, but
5360     // big-endian for the bit order.  For example, the polynomial 1 is
5361     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5362     //
5363     // So, we must either reverse the bytes in each word and do
5364     // everything big-endian or reverse the bits in each byte and do
5365     // it little-endian.  On AArch64 it's more idiomatic to reverse
5366     // the bits in each byte (we have an instruction, RBIT, to do
5367     // that) and keep the data in little-endian bit order throughout the
5368     // calculation, bit-reversing the inputs and outputs.
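    // As a worked example: p(z) = z^7+z^2+z+1 has coefficient bits 0, 1, 2
    // and 7 set, which in GCM's MSB-first bit order is the byte 0xE1; with
    // the bits of each byte reversed, as done here, it becomes 0x87, the
    // constant emitted just below.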
5369 
5370     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5371     __ align(wordSize * 2);
5372     address p = __ pc();
5373     __ emit_int64(0x87);  // The low-order bits of the field
5374                           // polynomial (i.e. p = z^7+z^2+z+1)
5375                           // repeated in the low and high parts of a
5376                           // 128-bit vector
5377     __ emit_int64(0x87);
5378 
5379     __ align(CodeEntryAlignment);
5380     address start = __ pc();
5381 
5382     Register state   = c_rarg0;
5383     Register subkeyH = c_rarg1;
5384     Register data    = c_rarg2;
5385     Register blocks  = c_rarg3;
5386 
5387     FloatRegister vzr = v30;
5388     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5389 
5390     __ ldrq(v0, Address(state));
5391     __ ldrq(v1, Address(subkeyH));
5392 
5393     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5394     __ rbit(v0, __ T16B, v0);
5395     __ rev64(v1, __ T16B, v1);
5396     __ rbit(v1, __ T16B, v1);
5397 
5398     __ ldrq(v26, p);
5399 
5400     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5401     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5402 
5403     {
5404       Label L_ghash_loop;
5405       __ bind(L_ghash_loop);
5406 
5407       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5408                                                  // reversing each byte
5409       __ rbit(v2, __ T16B, v2);
5410       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5411 
5412       // Multiply state in v2 by subkey in v1
5413       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5414                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
5415                      /*temps*/v6, v20, v18, v21);
5416       // Reduce v7:v5 by the field polynomial
5417       ghash_reduce(v0, v5, v7, v26, vzr, v20);
5418 
5419       __ sub(blocks, blocks, 1);
5420       __ cbnz(blocks, L_ghash_loop);
5421     }
5422 
5423     // The bit-reversed result is at this point in v0
5424     __ rev64(v1, __ T16B, v0);
5425     __ rbit(v1, __ T16B, v1);
5426 
5427     __ st1(v1, __ T16B, state);
5428     __ ret(lr);
5429 
5430     return start;
5431   }
5432 
5433   void generate_base64_encode_simdround(Register src, Register dst,
5434         FloatRegister codec, u8 size) {
5435 
5436     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5437     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5438     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5439 
5440     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5441 
5442     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5443 
5444     __ ushr(ind0, arrangement, in0,  2);
5445 
5446     __ ushr(ind1, arrangement, in1,  2);
5447     __ shl(in0,   arrangement, in0,  6);
5448     __ orr(ind1,  arrangement, ind1, in0);
5449     __ ushr(ind1, arrangement, ind1, 2);
5450 
5451     __ ushr(ind2, arrangement, in2,  4);
5452     __ shl(in1,   arrangement, in1,  4);
5453     __ orr(ind2,  arrangement, in1,  ind2);
5454     __ ushr(ind2, arrangement, ind2, 2);
5455 
5456     __ shl(ind3,  arrangement, in2,  2);
5457     __ ushr(ind3, arrangement, ind3, 2);
5458 
5459     __ tbl(out0,  arrangement, codec,  4, ind0);
5460     __ tbl(out1,  arrangement, codec,  4, ind1);
5461     __ tbl(out2,  arrangement, codec,  4, ind2);
5462     __ tbl(out3,  arrangement, codec,  4, ind3);
5463 
5464     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5465   }
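  // The ushr/shl/orr sequence above computes, for each 3-byte group
  // {b0, b1, b2}, the four 6-bit codec indices; in scalar terms
  // (illustrative only):
  //
  //   ind0 = b0 >> 2;
  //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
  //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
  //   ind3 = b2 & 0x3f;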
5466 
5467    /**
5468    *  Arguments:
5469    *
5470    *  Input:
5471    *  c_rarg0   - src_start
5472    *  c_rarg1   - src_offset
5473    *  c_rarg2   - src_length
5474    *  c_rarg3   - dest_start
5475    *  c_rarg4   - dest_offset
5476    *  c_rarg5   - isURL
5477    *
5478    */
5479   address generate_base64_encodeBlock() {
5480 
5481     static const char toBase64[64] = {
5482       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5483       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5484       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5485       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5486       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5487     };
5488 
5489     static const char toBase64URL[64] = {
5490       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5491       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5492       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5493       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5494       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5495     };
5496 
5497     __ align(CodeEntryAlignment);
5498     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5499     address start = __ pc();
5500 
5501     Register src   = c_rarg0;  // source array
5502     Register soff  = c_rarg1;  // source start offset
5503     Register send  = c_rarg2;  // source end offset
5504     Register dst   = c_rarg3;  // dest array
5505     Register doff  = c_rarg4;  // position for writing to dest array
5506     Register isURL = c_rarg5;  // Base64 or URL character set
5507 
5508     // c_rarg6 and c_rarg7 are free to use as temps
5509     Register codec  = c_rarg6;
5510     Register length = c_rarg7;
5511 
5512     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5513 
5514     __ add(src, src, soff);
5515     __ add(dst, dst, doff);
5516     __ sub(length, send, soff);
5517 
5518     // load the codec base address
5519     __ lea(codec, ExternalAddress((address) toBase64));
5520     __ cbz(isURL, ProcessData);
5521     __ lea(codec, ExternalAddress((address) toBase64URL));
5522 
5523     __ BIND(ProcessData);
5524 
5525     // too short to be worth a SIMD loop; fall back to the 3-byte scalar loop
5526     __ cmp(length, (u1)24);
5527     __ br(Assembler::LT, Process3B);
5528 
5529     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5530 
5531     __ BIND(Process48B);
5532     __ cmp(length, (u1)48);
5533     __ br(Assembler::LT, Process24B);
5534     generate_base64_encode_simdround(src, dst, v0, 16);
5535     __ sub(length, length, 48);
5536     __ b(Process48B);
5537 
5538     __ BIND(Process24B);
5539     __ cmp(length, (u1)24);
5540     __ br(Assembler::LT, SIMDExit);
5541     generate_base64_encode_simdround(src, dst, v0, 8);
5542     __ sub(length, length, 24);
5543 
5544     __ BIND(SIMDExit);
5545     __ cbz(length, Exit);
5546 
5547     __ BIND(Process3B);
5548     //  3 src bytes, 24 bits
5549     __ ldrb(r10, __ post(src, 1));
5550     __ ldrb(r11, __ post(src, 1));
5551     __ ldrb(r12, __ post(src, 1));
5552     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5553     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5554     // codec index
5555     __ ubfmw(r15, r12, 18, 23);
5556     __ ubfmw(r14, r12, 12, 17);
5557     __ ubfmw(r13, r12, 6,  11);
5558     __ andw(r12,  r12, 63);
5559     // get the code based on the codec
5560     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5561     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5562     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5563     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5564     __ strb(r15, __ post(dst, 1));
5565     __ strb(r14, __ post(dst, 1));
5566     __ strb(r13, __ post(dst, 1));
5567     __ strb(r12, __ post(dst, 1));
5568     __ sub(length, length, 3);
5569     __ cbnz(length, Process3B);
5570 
5571     __ BIND(Exit);
5572     __ ret(lr);
5573 
5574     return start;
5575   }
5576 
5577   void generate_base64_decode_simdround(Register src, Register dst,
5578         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5579 
5580     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5581     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5582 
5583     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5584     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5585 
5586     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5587 
5588     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5589 
5590     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5591 
5592     // We need an unsigned saturating subtract so that every input value in
5593     // the range [0, 63] yields 0 from the higher-half lookup.
5594     __ uqsubv(decH0, __ T16B, in0, v27);
5595     __ uqsubv(decH1, __ T16B, in1, v27);
5596     __ uqsubv(decH2, __ T16B, in2, v27);
5597     __ uqsubv(decH3, __ T16B, in3, v27);
5598 
5599     // lower half lookup
5600     __ tbl(decL0, arrangement, codecL, 4, in0);
5601     __ tbl(decL1, arrangement, codecL, 4, in1);
5602     __ tbl(decL2, arrangement, codecL, 4, in2);
5603     __ tbl(decL3, arrangement, codecL, 4, in3);
5604 
5605     // higher half lookup
5606     __ tbx(decH0, arrangement, codecH, 4, decH0);
5607     __ tbx(decH1, arrangement, codecH, 4, decH1);
5608     __ tbx(decH2, arrangement, codecH, 4, decH2);
5609     __ tbx(decH3, arrangement, codecH, 4, decH3);
5610 
5611     // combine lower and higher
5612     __ orr(decL0, arrangement, decL0, decH0);
5613     __ orr(decL1, arrangement, decL1, decH1);
5614     __ orr(decL2, arrangement, decL2, decH2);
5615     __ orr(decL3, arrangement, decL3, decH3);
5616 
5617     // check illegal inputs, value larger than 63 (maximum of 6 bits)
5618     __ cmhi(decH0, arrangement, decL0, v27);
5619     __ cmhi(decH1, arrangement, decL1, v27);
5620     __ cmhi(decH2, arrangement, decL2, v27);
5621     __ cmhi(decH3, arrangement, decL3, v27);
5622     __ orr(in0, arrangement, decH0, decH1);
5623     __ orr(in1, arrangement, decH2, decH3);
5624     __ orr(in2, arrangement, in0,   in1);
5625     __ umaxv(in3, arrangement, in2);
5626     __ umov(rscratch2, in3, __ B, 0);
5627 
5628     // get the data to output
5629     __ shl(out0,  arrangement, decL0, 2);
5630     __ ushr(out1, arrangement, decL1, 4);
5631     __ orr(out0,  arrangement, out0,  out1);
5632     __ shl(out1,  arrangement, decL1, 4);
5633     __ ushr(out2, arrangement, decL2, 2);
5634     __ orr(out1,  arrangement, out1,  out2);
5635     __ shl(out2,  arrangement, decL2, 6);
5636     __ orr(out2,  arrangement, out2,  decL3);
5637 
5638     __ cbz(rscratch2, NoIllegalData);
5639 
5640     // handle illegal input
5641     __ umov(r10, in2, __ D, 0);
5642     if (size == 16) {
5643       __ cbnz(r10, ErrorInLowerHalf);
5644 
5645       // illegal input is in higher half, store the lower half now.
5646       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5647 
5648       __ umov(r10, in2,  __ D, 1);
5649       __ umov(r11, out0, __ D, 1);
5650       __ umov(r12, out1, __ D, 1);
5651       __ umov(r13, out2, __ D, 1);
5652       __ b(StoreLegalData);
5653 
5654       __ BIND(ErrorInLowerHalf);
5655     }
5656     __ umov(r11, out0, __ D, 0);
5657     __ umov(r12, out1, __ D, 0);
5658     __ umov(r13, out2, __ D, 0);
5659 
5660     __ BIND(StoreLegalData);
5661     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5662     __ strb(r11, __ post(dst, 1));
5663     __ strb(r12, __ post(dst, 1));
5664     __ strb(r13, __ post(dst, 1));
5665     __ lsr(r10, r10, 8);
5666     __ lsr(r11, r11, 8);
5667     __ lsr(r12, r12, 8);
5668     __ lsr(r13, r13, 8);
5669     __ b(StoreLegalData);
5670 
5671     __ BIND(NoIllegalData);
5672     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5673   }
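
  // A scalar sketch (illustrative only, not the generated code) of the split
  // table lookup performed above for one input byte c, where tab is one of
  // the 128-entry fromBase64*ForSIMD tables: tbl returns 0 for out-of-range
  // indices, tbx leaves the destination (here the uqsub result) unchanged,
  // and any combined value above 63 marks an illegal input.
  //
  //   static unsigned char decode6(unsigned char c, const unsigned char tab[128]) {
  //     unsigned char lo = (c <= 63) ? tab[c] : 0;                      // tbl on codecL
  //     unsigned char hi_idx = (c <= 63) ? 0 : (unsigned char)(c - 63); // uqsub with 63
  //     unsigned char hi = (hi_idx <= 63) ? tab[64 + hi_idx] : hi_idx;  // tbx on codecH
  //     return lo | hi;   // a result > 63 means c was not a legal Base64 character
  //   }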
5674 
5675 
5676    /**
5677    *  Arguments:
5678    *
5679    *  Input:
5680    *  c_rarg0   - src_start
5681    *  c_rarg1   - src_offset
5682    *  c_rarg2   - src_length
5683    *  c_rarg3   - dest_start
5684    *  c_rarg4   - dest_offset
5685    *  c_rarg5   - isURL
5686    *
5687    */
5688   address generate_base64_decodeBlock() {
5689 
5690     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
5691     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
5692     // titled "Base64 decoding".
5693 
5694     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
5695     // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
5696     // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
5697     static const uint8_t fromBase64ForNoSIMD[256] = {
5698       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5699       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5700       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5701        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5702       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5703        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5704       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5705        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5706       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5707       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5708       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5709       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5710       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5711       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5712       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5713       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5714     };
5715 
5716     static const uint8_t fromBase64URLForNoSIMD[256] = {
5717       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5718       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5719       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5720        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5721       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5722        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5723       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5724        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5725       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5726       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5727       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5728       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5729       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5730       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5731       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5732       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5733     };
5734 
5735     // A legal Base64 code value is in the range [0, 127].  We need two table
5736     // lookups with tbl/tbx and combine the results to get the decoded data. The
5737     // first table vector lookup uses tbl: out-of-range indices are set to 0 in the
5738     // destination. The second table vector lookup uses tbx: out-of-range indices
5739     // leave the destination unchanged. Inputs [64..126] map to indices [65, 127]
5740     // in the second lookup. The entry at index 64 is set to 0, so inputs already
5741     // decoded by the first lookup contribute nothing from the second.
5742     static const uint8_t fromBase64ForSIMD[128] = {
5743       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5744       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5745       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5746        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5747         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5748        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5749       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5750        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5751     };
5752 
5753     static const uint8_t fromBase64URLForSIMD[128] = {
5754       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5755       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5756       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5757        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5758         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5759        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5760        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5761        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5762     };
5763 
5764     __ align(CodeEntryAlignment);
5765     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5766     address start = __ pc();
5767 
5768     Register src   = c_rarg0;  // source array
5769     Register soff  = c_rarg1;  // source start offset
5770     Register send  = c_rarg2;  // source end offset
5771     Register dst   = c_rarg3;  // dest array
5772     Register doff  = c_rarg4;  // position for writing to dest array
5773     Register isURL = c_rarg5;  // Base64 or URL character set
5774 
5775     Register length = send;    // reuse send as length of source data to process
5776 
5777     Register simd_codec   = c_rarg6;
5778     Register nosimd_codec = c_rarg7;
5779 
5780     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
5781 
5782     __ enter();
5783 
5784     __ add(src, src, soff);
5785     __ add(dst, dst, doff);
5786 
5787     __ mov(doff, dst);
5788 
5789     __ sub(length, send, soff);
5790     __ bfm(length, zr, 0, 1);
5791 
5792     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
5793     __ cbz(isURL, ProcessData);
5794     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
5795 
5796     __ BIND(ProcessData);
5797     __ mov(rscratch1, length);
5798     __ cmp(length, (u1)144); // 144 = 80 + 64
5799     __ br(Assembler::LT, Process4B);
5800 
5801     // In the MIME case, the line length cannot be more than 76
5802     // bytes (see RFC 2045). This is too short a block for SIMD
5803     // to be worthwhile, so we use non-SIMD here.
5804     __ movw(rscratch1, 79);
5805 
5806     __ BIND(Process4B);
5807     __ ldrw(r14, __ post(src, 4));
5808     __ ubfxw(r10, r14, 0,  8);
5809     __ ubfxw(r11, r14, 8,  8);
5810     __ ubfxw(r12, r14, 16, 8);
5811     __ ubfxw(r13, r14, 24, 8);
5812     // look up the decoded 6-bit values
5813     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
5814     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
5815     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
5816     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
5817     // error detection, 255u indicates an illegal input
5818     __ orrw(r14, r10, r11);
5819     __ orrw(r15, r12, r13);
5820     __ orrw(r14, r14, r15);
5821     __ tbnz(r14, 7, Exit);
5822     // recover the data
5823     __ lslw(r14, r10, 10);
5824     __ bfiw(r14, r11, 4, 6);
5825     __ bfmw(r14, r12, 2, 5);
5826     __ rev16w(r14, r14);
5827     __ bfiw(r13, r12, 6, 2);
5828     __ strh(r14, __ post(dst, 2));
5829     __ strb(r13, __ post(dst, 1));
5830     // non-simd loop
5831     __ subsw(rscratch1, rscratch1, 4);
5832     __ br(Assembler::GT, Process4B);
5833 
5834     // rscratch1 == -1 here if we came through the 80-byte MIME pre-processing
5835     // above (rscratch1 was set to 79); otherwise rscratch1 == 0.
5836     __ cbzw(rscratch1, Exit);
5837     __ sub(length, length, 80);
5838 
5839     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
5840     __ cbz(isURL, SIMDEnter);
5841     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
5842 
5843     __ BIND(SIMDEnter);
5844     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
5845     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
5846     __ mov(rscratch1, 63);
5847     __ dup(v27, __ T16B, rscratch1);
5848 
5849     __ BIND(Process64B);
5850     __ cmp(length, (u1)64);
5851     __ br(Assembler::LT, Process32B);
5852     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
5853     __ sub(length, length, 64);
5854     __ b(Process64B);
5855 
5856     __ BIND(Process32B);
5857     __ cmp(length, (u1)32);
5858     __ br(Assembler::LT, SIMDExit);
5859     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
5860     __ sub(length, length, 32);
5861     __ b(Process32B);
5862 
5863     __ BIND(SIMDExit);
5864     __ cbz(length, Exit);
5865     __ movw(rscratch1, length);
5866     __ b(Process4B);
5867 
5868     __ BIND(Exit);
5869     __ sub(c_rarg0, dst, doff);
5870 
5871     __ leave();
5872     __ ret(lr);
5873 
5874     return start;
5875   }
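
  // In C, the 4-character-to-3-byte recombination performed by both the SIMD
  // and the scalar paths above is approximately (v[] holds already-decoded
  // 6-bit values; the helper name is illustrative only):
  //
  //   static void decode4(const unsigned char v[4], unsigned char d[3]) {
  //     d[0] = (unsigned char)((v[0] << 2) | (v[1] >> 4));
  //     d[1] = (unsigned char)((v[1] << 4) | (v[2] >> 2));
  //     d[2] = (unsigned char)((v[2] << 6) |  v[3]);
  //   }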
5876 
5877 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
5878 
5879   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
5880   //
5881   // If LSE is in use, generate LSE versions of all the stubs. The
5882   // non-LSE versions are in atomic_aarch64.S.
5883 
5884   // class AtomicStubMark records the entry point of a stub and the
5885   // stub pointer which will point to it. The stub pointer is set to
5886   // the entry point when ~AtomicStubMark() is called, which must be
5887   // after ICache::invalidate_range. This ensures safe publication of
5888   // the generated code.
5889   class AtomicStubMark {
5890     address _entry_point;
5891     aarch64_atomic_stub_t *_stub;
5892     MacroAssembler *_masm;
5893   public:
5894     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
5895       _masm = masm;
5896       __ align(32);
5897       _entry_point = __ pc();
5898       _stub = stub;
5899     }
5900     ~AtomicStubMark() {
5901       *_stub = (aarch64_atomic_stub_t)_entry_point;
5902     }
5903   };
5904 
5905   // NB: For memory_order_conservative we need a trailing membar after
5906   // LSE atomic operations but not a leading membar.
5907   //
5908   // We don't need a leading membar because a clause in the Arm ARM
5909   // says:
5910   //
5911   //   Barrier-ordered-before
5912   //
5913   //   Barrier instructions order prior Memory effects before subsequent
5914   //   Memory effects generated by the same Observer. A read or a write
5915   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
5916   //   Observer if and only if RW1 appears in program order before RW2
5917   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
5918   //   instruction with both Acquire and Release semantics.
5919   //
5920   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
5921   // and Release semantics, therefore we don't need a leading
5922   // barrier. However, there is no corresponding Barrier-ordered-after
5923   // relationship, therefore we need a trailing membar to prevent a
5924   // later store or load from being reordered with the store in an
5925   // atomic instruction.
5926   //
5927   // This was checked by using the herd7 consistency model simulator
5928   // (http://diy.inria.fr/) with this test case:
5929   //
5930   // AArch64 LseCas
5931   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
5932   // P0 | P1;
5933   // LDR W4, [X2] | MOV W3, #0;
5934   // DMB LD       | MOV W4, #1;
5935   // LDR W3, [X1] | CASAL W3, W4, [X1];
5936   //              | DMB ISH;
5937   //              | STR W4, [X2];
5938   // exists
5939   // (0:X3=0 /\ 0:X4=1)
5940   //
5941   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
5942   // with the store to x in P1. Without the DMB in P1 this may happen.
5943   //
5944   // At the time of writing we don't know of any AArch64 hardware that
5945   // reorders stores in this way, but the Reference Manual permits it.
5946 
5947   void gen_cas_entry(Assembler::operand_size size,
5948                      atomic_memory_order order) {
5949     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
5950       exchange_val = c_rarg2;
5951     bool acquire, release;
5952     switch (order) {
5953       case memory_order_relaxed:
5954         acquire = false;
5955         release = false;
5956         break;
5957       default:
5958         acquire = true;
5959         release = true;
5960         break;
5961     }
5962     __ mov(prev, compare_val);
5963     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
5964     if (order == memory_order_conservative) {
5965       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5966     }
5967     if (size == Assembler::xword) {
5968       __ mov(r0, prev);
5969     } else {
5970       __ movw(r0, prev);
5971     }
5972     __ ret(lr);
5973   }
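
  // Ignoring the exact barrier placement, each generated CAS stub behaves
  // roughly like the following C (64-bit case shown; the function name is
  // illustrative only). The caller compares the returned value with
  // compare_val to decide whether the exchange happened.
  //
  //   uint64_t cmpxchg(uint64_t* ptr, uint64_t compare_val, uint64_t exchange_val) {
  //     uint64_t prev = *ptr;                       // one atomic CAS/CASAL instruction
  //     if (prev == compare_val) *ptr = exchange_val;
  //     return prev;
  //   }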
5974 
5975   void gen_ldaddal_entry(Assembler::operand_size size) {
5976     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
5977     __ ldaddal(size, incr, prev, addr);
5978     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5979     if (size == Assembler::xword) {
5980       __ mov(r0, prev);
5981     } else {
5982       __ movw(r0, prev);
5983     }
5984     __ ret(lr);
5985   }
5986 
5987   void gen_swpal_entry(Assembler::operand_size size) {
5988     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
5989     __ swpal(size, incr, prev, addr);
5990     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5991     if (size == Assembler::xword) {
5992       __ mov(r0, prev);
5993     } else {
5994       __ movw(r0, prev);
5995     }
5996     __ ret(lr);
5997   }
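
  // Likewise, the fetch-add and exchange stubs above return the previous
  // value; roughly, for the 64-bit case (illustrative names, barriers omitted):
  //
  //   uint64_t fetch_add(uint64_t* addr, uint64_t incr) {   // gen_ldaddal_entry
  //     uint64_t prev = *addr;                              // single LDADDAL
  //     *addr = prev + incr;
  //     return prev;
  //   }
  //   uint64_t xchg(uint64_t* addr, uint64_t newv) {        // gen_swpal_entry
  //     uint64_t prev = *addr;                              // single SWPAL
  //     *addr = newv;
  //     return prev;
  //   }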
5998 
5999   void generate_atomic_entry_points() {
6000     if (! UseLSE) {
6001       return;
6002     }
6003 
6004     __ align(CodeEntryAlignment);
6005     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6006     address first_entry = __ pc();
6007 
6008     // All memory_order_conservative
6009     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6010     gen_ldaddal_entry(Assembler::word);
6011     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6012     gen_ldaddal_entry(Assembler::xword);
6013 
6014     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6015     gen_swpal_entry(Assembler::word);
6016     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6017     gen_swpal_entry(Assembler::xword);
6018 
6019     // CAS, memory_order_conservative
6020     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6021     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6022     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6023     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6024     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6025     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6026 
6027     // CAS, memory_order_relaxed
6028     AtomicStubMark mark_cmpxchg_1_relaxed
6029       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6030     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6031     AtomicStubMark mark_cmpxchg_4_relaxed
6032       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6033     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6034     AtomicStubMark mark_cmpxchg_8_relaxed
6035       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6036     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6037 
6038     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6039   }
6040 #endif // LINUX || _ALLBSD_SOURCE
6041 
6042   // Continuation point for throwing of implicit exceptions that are
6043   // not handled in the current activation. Fabricates an exception
6044   // oop and initiates normal exception dispatching in this
6045   // frame. Since we need to preserve callee-saved values (currently
6046   // only for C2, but done for C1 as well) we need a callee-saved oop
6047   // map and therefore have to make these stubs into RuntimeStubs
6048   // rather than BufferBlobs.  If the compiler needs all registers to
6049   // be preserved between the fault point and the exception handler
6050   // then it must assume responsibility for that in
6051   // AbstractCompiler::continuation_for_implicit_null_exception or
6052   // continuation_for_implicit_division_by_zero_exception. All other
6053   // implicit exceptions (e.g., NullPointerException or
6054   // AbstractMethodError on entry) are either at call sites or
6055   // otherwise assume that stack unwinding will be initiated, so
6056   // caller saved registers were assumed volatile in the compiler.
6057 
6058 #undef __
6059 #define __ masm->
6060 
6061   address generate_throw_exception(const char* name,
6062                                    address runtime_entry,
6063                                    Register arg1 = noreg,
6064                                    Register arg2 = noreg) {
6065     // Information about frame layout at time of blocking runtime call.
6066     // Note that we only have to preserve callee-saved registers since
6067     // the compilers are responsible for supplying a continuation point
6068     // if they expect all registers to be preserved.
6069     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6070     enum layout {
6071       rfp_off = 0,
6072       rfp_off2,
6073       return_off,
6074       return_off2,
6075       framesize // inclusive of return address
6076     };
6077 
6078     int insts_size = 512;
6079     int locs_size  = 64;
6080 
6081     CodeBuffer code(name, insts_size, locs_size);
6082     OopMapSet* oop_maps  = new OopMapSet();
6083     MacroAssembler* masm = new MacroAssembler(&code);
6084 
6085     address start = __ pc();
6086 
6087     // This is an inlined and slightly modified version of call_VM
6088     // which has the ability to fetch the return PC out of
6089     // thread-local storage and also sets up last_Java_sp slightly
6090     // differently than the real call_VM
6091 
6092     __ enter(); // Save FP and LR before call
6093 
6094     assert(is_even(framesize/2), "sp not 16-byte aligned");
6095 
6096     // lr and fp are already in place
6097     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6098 
6099     int frame_complete = __ pc() - start;
6100 
6101     // Set up last_Java_sp and last_Java_fp
6102     address the_pc = __ pc();
6103     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6104 
6105     // Call runtime
6106     if (arg1 != noreg) {
6107       assert(arg2 != c_rarg1, "clobbered");
6108       __ mov(c_rarg1, arg1);
6109     }
6110     if (arg2 != noreg) {
6111       __ mov(c_rarg2, arg2);
6112     }
6113     __ mov(c_rarg0, rthread);
6114     BLOCK_COMMENT("call runtime_entry");
6115     __ mov(rscratch1, runtime_entry);
6116     __ blr(rscratch1);
6117 
6118     // Generate oop map
6119     OopMap* map = new OopMap(framesize, 0);
6120 
6121     oop_maps->add_gc_map(the_pc - start, map);
6122 
6123     __ reset_last_Java_frame(true);
6124 
6125     // Reinitialize the ptrue predicate register, in case the external runtime
6126     // call clobbers ptrue reg, as we may return to SVE compiled code.
6127     __ reinitialize_ptrue();
6128 
6129     __ leave();
6130 
6131     // check for pending exceptions
6132 #ifdef ASSERT
6133     Label L;
6134     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6135     __ cbnz(rscratch1, L);
6136     __ should_not_reach_here();
6137     __ bind(L);
6138 #endif // ASSERT
6139     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6140 
6141 
6142     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6143     RuntimeStub* stub =
6144       RuntimeStub::new_runtime_stub(name,
6145                                     &code,
6146                                     frame_complete,
6147                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6148                                     oop_maps, false);
6149     return stub->entry_point();
6150   }
6151 
6152   class MontgomeryMultiplyGenerator : public MacroAssembler {
6153 
6154     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6155       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6156 
6157     RegSet _toSave;
6158     bool _squaring;
6159 
6160   public:
6161     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6162       : MacroAssembler(as->code()), _squaring(squaring) {
6163 
6164       // Register allocation
6165 
6166       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6167       Pa_base = *regs;       // Argument registers
6168       if (squaring)
6169         Pb_base = Pa_base;
6170       else
6171         Pb_base = *++regs;
6172       Pn_base = *++regs;
6173       Rlen = *++regs;
6174       inv = *++regs;
6175       Pm_base = *++regs;
6176 
6177                           // Working registers:
6178       Ra =  *++regs;        // The current digit of a, b, n, and m.
6179       Rb =  *++regs;
6180       Rm =  *++regs;
6181       Rn =  *++regs;
6182 
6183       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6184       Pb =  *++regs;
6185       Pm =  *++regs;
6186       Pn =  *++regs;
6187 
6188       t0 =  *++regs;        // Three registers which form a
6189       t1 =  *++regs;        // triple-precision accumulator.
6190       t2 =  *++regs;
6191 
6192       Ri =  *++regs;        // Inner and outer loop indexes.
6193       Rj =  *++regs;
6194 
6195       Rhi_ab = *++regs;     // Product registers: low and high parts
6196       Rlo_ab = *++regs;     // of a*b and m*n.
6197       Rhi_mn = *++regs;
6198       Rlo_mn = *++regs;
6199 
6200       // r19 and up are callee-saved.
6201       _toSave = RegSet::range(r19, *regs) + Pm_base;
6202     }
6203 
6204   private:
6205     void save_regs() {
6206       push(_toSave, sp);
6207     }
6208 
6209     void restore_regs() {
6210       pop(_toSave, sp);
6211     }
6212 
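    // unroll_2 executes 'block' exactly 'count' times, emitting two copies of
    // the block per loop iteration; an odd count enters the loop at the second
    // copy (the 'odd' label), so no extra conditional is needed inside the loop.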
6213     template <typename T>
6214     void unroll_2(Register count, T block) {
6215       Label loop, end, odd;
6216       tbnz(count, 0, odd);
6217       cbz(count, end);
6218       align(16);
6219       bind(loop);
6220       (this->*block)();
6221       bind(odd);
6222       (this->*block)();
6223       subs(count, count, 2);
6224       br(Assembler::GT, loop);
6225       bind(end);
6226     }
6227 
6228     template <typename T>
6229     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6230       Label loop, end, odd;
6231       tbnz(count, 0, odd);
6232       cbz(count, end);
6233       align(16);
6234       bind(loop);
6235       (this->*block)(d, s, tmp);
6236       bind(odd);
6237       (this->*block)(d, s, tmp);
6238       subs(count, count, 2);
6239       br(Assembler::GT, loop);
6240       bind(end);
6241     }
6242 
6243     void pre1(RegisterOrConstant i) {
6244       block_comment("pre1");
6245       // Pa = Pa_base;
6246       // Pb = Pb_base + i;
6247       // Pm = Pm_base;
6248       // Pn = Pn_base + i;
6249       // Ra = *Pa;
6250       // Rb = *Pb;
6251       // Rm = *Pm;
6252       // Rn = *Pn;
6253       ldr(Ra, Address(Pa_base));
6254       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6255       ldr(Rm, Address(Pm_base));
6256       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6257       lea(Pa, Address(Pa_base));
6258       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6259       lea(Pm, Address(Pm_base));
6260       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6261 
6262       // Zero the m*n result.
6263       mov(Rhi_mn, zr);
6264       mov(Rlo_mn, zr);
6265     }
6266 
6267     // The core multiply-accumulate step of a Montgomery
6268     // multiplication.  The idea is to schedule operations as a
6269     // pipeline so that instructions with long latencies (loads and
6270     // multiplies) have time to complete before their results are
6271     // used.  This most benefits in-order implementations of the
6272     // architecture but out-of-order ones also benefit.
6273     void step() {
6274       block_comment("step");
6275       // MACC(Ra, Rb, t0, t1, t2);
6276       // Ra = *++Pa;
6277       // Rb = *--Pb;
6278       umulh(Rhi_ab, Ra, Rb);
6279       mul(Rlo_ab, Ra, Rb);
6280       ldr(Ra, pre(Pa, wordSize));
6281       ldr(Rb, pre(Pb, -wordSize));
6282       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6283                                        // previous iteration.
6284       // MACC(Rm, Rn, t0, t1, t2);
6285       // Rm = *++Pm;
6286       // Rn = *--Pn;
6287       umulh(Rhi_mn, Rm, Rn);
6288       mul(Rlo_mn, Rm, Rn);
6289       ldr(Rm, pre(Pm, wordSize));
6290       ldr(Rn, pre(Pn, -wordSize));
6291       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6292     }
6293 
6294     void post1() {
6295       block_comment("post1");
6296 
6297       // MACC(Ra, Rb, t0, t1, t2);
6298       // Ra = *++Pa;
6299       // Rb = *--Pb;
6300       umulh(Rhi_ab, Ra, Rb);
6301       mul(Rlo_ab, Ra, Rb);
6302       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6303       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6304 
6305       // *Pm = Rm = t0 * inv;
6306       mul(Rm, t0, inv);
6307       str(Rm, Address(Pm));
6308 
6309       // MACC(Rm, Rn, t0, t1, t2);
6310       // t0 = t1; t1 = t2; t2 = 0;
6311       umulh(Rhi_mn, Rm, Rn);
6312 
6313 #ifndef PRODUCT
6314       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6315       {
6316         mul(Rlo_mn, Rm, Rn);
6317         add(Rlo_mn, t0, Rlo_mn);
6318         Label ok;
6319         cbz(Rlo_mn, ok); {
6320           stop("broken Montgomery multiply");
6321         } bind(ok);
6322       }
6323 #endif
6324       // We have very carefully set things up so that
6325       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6326       // the lower half of Rm * Rn because we know the result already:
6327       // it must be -t0.  t0 + (-t0) must generate a carry iff
6328       // t0 != 0.  So, rather than do a mul and an adds we just set
6329       // the carry flag iff t0 is nonzero.
6330       //
6331       // mul(Rlo_mn, Rm, Rn);
6332       // adds(zr, t0, Rlo_mn);
6333       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6334       adcs(t0, t1, Rhi_mn);
6335       adc(t1, t2, zr);
6336       mov(t2, zr);
6337     }
6338 
6339     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6340       block_comment("pre2");
6341       // Pa = Pa_base + i-len;
6342       // Pb = Pb_base + len;
6343       // Pm = Pm_base + i-len;
6344       // Pn = Pn_base + len;
6345 
6346       if (i.is_register()) {
6347         sub(Rj, i.as_register(), len);
6348       } else {
6349         mov(Rj, i.as_constant());
6350         sub(Rj, Rj, len);
6351       }
6352       // Rj == i-len
6353 
6354       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6355       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6356       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6357       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6358 
6359       // Ra = *++Pa;
6360       // Rb = *--Pb;
6361       // Rm = *++Pm;
6362       // Rn = *--Pn;
6363       ldr(Ra, pre(Pa, wordSize));
6364       ldr(Rb, pre(Pb, -wordSize));
6365       ldr(Rm, pre(Pm, wordSize));
6366       ldr(Rn, pre(Pn, -wordSize));
6367 
6368       mov(Rhi_mn, zr);
6369       mov(Rlo_mn, zr);
6370     }
6371 
6372     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6373       block_comment("post2");
6374       if (i.is_constant()) {
6375         mov(Rj, i.as_constant()-len.as_constant());
6376       } else {
6377         sub(Rj, i.as_register(), len);
6378       }
6379 
6380       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6381 
6382       // As soon as we know the least significant digit of our result,
6383       // store it.
6384       // Pm_base[i-len] = t0;
6385       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6386 
6387       // t0 = t1; t1 = t2; t2 = 0;
6388       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6389       adc(t1, t2, zr);
6390       mov(t2, zr);
6391     }
6392 
6393     // A carry in t0 after Montgomery multiplication means that we
6394     // should subtract multiples of n from our result in m.  We'll
6395     // keep doing that until there is no carry.
6396     void normalize(RegisterOrConstant len) {
6397       block_comment("normalize");
6398       // while (t0)
6399       //   t0 = sub(Pm_base, Pn_base, t0, len);
6400       Label loop, post, again;
6401       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6402       cbz(t0, post); {
6403         bind(again); {
6404           mov(i, zr);
6405           mov(cnt, len);
6406           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6407           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6408           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6409           align(16);
6410           bind(loop); {
6411             sbcs(Rm, Rm, Rn);
6412             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6413             add(i, i, 1);
6414             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6415             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6416             sub(cnt, cnt, 1);
6417           } cbnz(cnt, loop);
6418           sbc(t0, t0, zr);
6419         } cbnz(t0, again);
6420       } bind(post);
6421     }
6422 
6423     // Move memory at s to d, reversing words.
6424     //    Increments d to end of copied memory
6425     //    Destroys tmp1, tmp2
6426     //    Preserves len
6427     //    Leaves s pointing to the address which was in d at start
6428     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6429       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6430 
6431       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6432       mov(tmp1, len);
6433       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6434       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6435     }
6436     // where
6437     void reverse1(Register d, Register s, Register tmp) {
6438       ldr(tmp, pre(s, -wordSize));
6439       ror(tmp, tmp, 32);
6440       str(tmp, post(d, wordSize));
6441     }
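
    // In C, reverse() above is approximately (d and s are julong*, len counted
    // in 64-bit words; it reverses the word order and swaps the two 32-bit
    // halves within each word):
    //
    //   for (int i = 0; i < len; i++)
    //     d[i] = rotate_right_64(s[len - 1 - i], 32);
    //   // afterwards d points just past the copied words and s points to the old d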
6442 
6443     void step_squaring() {
6444       // An extra ACC
6445       step();
6446       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6447     }
6448 
6449     void last_squaring(RegisterOrConstant i) {
6450       Label dont;
6451       // if ((i & 1) == 0) {
6452       tbnz(i.as_register(), 0, dont); {
6453         // MACC(Ra, Rb, t0, t1, t2);
6454         // Ra = *++Pa;
6455         // Rb = *--Pb;
6456         umulh(Rhi_ab, Ra, Rb);
6457         mul(Rlo_ab, Ra, Rb);
6458         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6459       } bind(dont);
6460     }
6461 
6462     void extra_step_squaring() {
6463       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6464 
6465       // MACC(Rm, Rn, t0, t1, t2);
6466       // Rm = *++Pm;
6467       // Rn = *--Pn;
6468       umulh(Rhi_mn, Rm, Rn);
6469       mul(Rlo_mn, Rm, Rn);
6470       ldr(Rm, pre(Pm, wordSize));
6471       ldr(Rn, pre(Pn, -wordSize));
6472     }
6473 
6474     void post1_squaring() {
6475       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6476 
6477       // *Pm = Rm = t0 * inv;
6478       mul(Rm, t0, inv);
6479       str(Rm, Address(Pm));
6480 
6481       // MACC(Rm, Rn, t0, t1, t2);
6482       // t0 = t1; t1 = t2; t2 = 0;
6483       umulh(Rhi_mn, Rm, Rn);
6484 
6485 #ifndef PRODUCT
6486       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6487       {
6488         mul(Rlo_mn, Rm, Rn);
6489         add(Rlo_mn, t0, Rlo_mn);
6490         Label ok;
6491         cbz(Rlo_mn, ok); {
6492           stop("broken Montgomery multiply");
6493         } bind(ok);
6494       }
6495 #endif
6496       // We have very carefully set things up so that
6497       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6498       // the lower half of Rm * Rn because we know the result already:
6499       // it must be -t0.  t0 + (-t0) must generate a carry iff
6500       // t0 != 0.  So, rather than do a mul and an adds we just set
6501       // the carry flag iff t0 is nonzero.
6502       //
6503       // mul(Rlo_mn, Rm, Rn);
6504       // adds(zr, t0, Rlo_mn);
6505       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6506       adcs(t0, t1, Rhi_mn);
6507       adc(t1, t2, zr);
6508       mov(t2, zr);
6509     }
6510 
6511     void acc(Register Rhi, Register Rlo,
6512              Register t0, Register t1, Register t2) {
6513       adds(t0, t0, Rlo);
6514       adcs(t1, t1, Rhi);
6515       adc(t2, t2, zr);
6516     }
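
    // For reference, the MACC used in the pseudocode comments here and in the
    // C sketches below accumulates a 128-bit product into the t2:t1:t0
    // triple-precision accumulator; approximately as follows (a sketch, and
    // MACC2 simply accumulates the same product twice):
    //
    //   static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;           // add low half
    //     t0 = (julong)s;
    //     s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64); // add high half + carry
    //     t1 = (julong)s;
    //     t2 += (julong)(s >> 64);                                           // propagate final carry
    //   }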
6517 
6518   public:
6519     /**
6520      * Fast Montgomery multiplication.  The derivation of the
6521      * algorithm is in A Cryptographic Library for the Motorola
6522      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6523      *
6524      * Arguments:
6525      *
6526      * Inputs for multiplication:
6527      *   c_rarg0   - int array elements a
6528      *   c_rarg1   - int array elements b
6529      *   c_rarg2   - int array elements n (the modulus)
6530      *   c_rarg3   - int length
6531      *   c_rarg4   - int inv
6532      *   c_rarg5   - int array elements m (the result)
6533      *
6534      * Inputs for squaring:
6535      *   c_rarg0   - int array elements a
6536      *   c_rarg1   - int array elements n (the modulus)
6537      *   c_rarg2   - int length
6538      *   c_rarg3   - int inv
6539      *   c_rarg4   - int array elements m (the result)
6540      *
6541      */
6542     address generate_multiply() {
6543       Label argh, nothing;
6544       bind(argh);
6545       stop("MontgomeryMultiply total_allocation must be <= 8192");
6546 
6547       align(CodeEntryAlignment);
6548       address entry = pc();
6549 
6550       cbzw(Rlen, nothing);
6551 
6552       enter();
6553 
6554       // Make room.
6555       cmpw(Rlen, 512);
6556       br(Assembler::HI, argh);
6557       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6558       andr(sp, Ra, -2 * wordSize);
6559 
6560       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6561 
6562       {
6563         // Copy input args, reversing as we go.  We use Ra as a
6564         // temporary variable.
6565         reverse(Ra, Pa_base, Rlen, t0, t1);
6566         if (!_squaring)
6567           reverse(Ra, Pb_base, Rlen, t0, t1);
6568         reverse(Ra, Pn_base, Rlen, t0, t1);
6569       }
6570 
6571       // Push all call-saved registers and also Pm_base which we'll need
6572       // at the end.
6573       save_regs();
6574 
6575 #ifndef PRODUCT
6576       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
6577       {
6578         ldr(Rn, Address(Pn_base, 0));
6579         mul(Rlo_mn, Rn, inv);
6580         subs(zr, Rlo_mn, -1);
6581         Label ok;
6582         br(EQ, ok); {
6583           stop("broken inverse in Montgomery multiply");
6584         } bind(ok);
6585       }
6586 #endif
6587 
6588       mov(Pm_base, Ra);
6589 
6590       mov(t0, zr);
6591       mov(t1, zr);
6592       mov(t2, zr);
6593 
6594       block_comment("for (int i = 0; i < len; i++) {");
6595       mov(Ri, zr); {
6596         Label loop, end;
6597         cmpw(Ri, Rlen);
6598         br(Assembler::GE, end);
6599 
6600         bind(loop);
6601         pre1(Ri);
6602 
6603         block_comment("  for (j = i; j; j--) {"); {
6604           movw(Rj, Ri);
6605           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6606         } block_comment("  } // j");
6607 
6608         post1();
6609         addw(Ri, Ri, 1);
6610         cmpw(Ri, Rlen);
6611         br(Assembler::LT, loop);
6612         bind(end);
6613         block_comment("} // i");
6614       }
6615 
6616       block_comment("for (int i = len; i < 2*len; i++) {");
6617       mov(Ri, Rlen); {
6618         Label loop, end;
6619         cmpw(Ri, Rlen, Assembler::LSL, 1);
6620         br(Assembler::GE, end);
6621 
6622         bind(loop);
6623         pre2(Ri, Rlen);
6624 
6625         block_comment("  for (j = len*2-i-1; j; j--) {"); {
6626           lslw(Rj, Rlen, 1);
6627           subw(Rj, Rj, Ri);
6628           subw(Rj, Rj, 1);
6629           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6630         } block_comment("  } // j");
6631 
6632         post2(Ri, Rlen);
6633         addw(Ri, Ri, 1);
6634         cmpw(Ri, Rlen, Assembler::LSL, 1);
6635         br(Assembler::LT, loop);
6636         bind(end);
6637       }
6638       block_comment("} // i");
6639 
6640       normalize(Rlen);
6641 
6642       mov(Ra, Pm_base);  // Save Pm_base in Ra
6643       restore_regs();  // Restore caller's Pm_base
6644 
6645       // Copy our result into caller's Pm_base
6646       reverse(Pm_base, Ra, Rlen, t0, t1);
6647 
6648       leave();
6649       bind(nothing);
6650       ret(lr);
6651 
6652       return entry;
6653     }
6654     // In C, approximately:
6655 
6656     // void
6657     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
6658     //                     julong Pn_base[], julong Pm_base[],
6659     //                     julong inv, int len) {
6660     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
6661     //   julong *Pa, *Pb, *Pn, *Pm;
6662     //   julong Ra, Rb, Rn, Rm;
6663 
6664     //   int i;
6665 
6666     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
6667 
6668     //   for (i = 0; i < len; i++) {
6669     //     int j;
6670 
6671     //     Pa = Pa_base;
6672     //     Pb = Pb_base + i;
6673     //     Pm = Pm_base;
6674     //     Pn = Pn_base + i;
6675 
6676     //     Ra = *Pa;
6677     //     Rb = *Pb;
6678     //     Rm = *Pm;
6679     //     Rn = *Pn;
6680 
6681     //     int iters = i;
6682     //     for (j = 0; iters--; j++) {
6683     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6684     //       MACC(Ra, Rb, t0, t1, t2);
6685     //       Ra = *++Pa;
6686     //       Rb = *--Pb;
6687     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6688     //       MACC(Rm, Rn, t0, t1, t2);
6689     //       Rm = *++Pm;
6690     //       Rn = *--Pn;
6691     //     }
6692 
6693     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
6694     //     MACC(Ra, Rb, t0, t1, t2);
6695     //     *Pm = Rm = t0 * inv;
6696     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
6697     //     MACC(Rm, Rn, t0, t1, t2);
6698 
6699     //     assert(t0 == 0, "broken Montgomery multiply");
6700 
6701     //     t0 = t1; t1 = t2; t2 = 0;
6702     //   }
6703 
6704     //   for (i = len; i < 2*len; i++) {
6705     //     int j;
6706 
6707     //     Pa = Pa_base + i-len;
6708     //     Pb = Pb_base + len;
6709     //     Pm = Pm_base + i-len;
6710     //     Pn = Pn_base + len;
6711 
6712     //     Ra = *++Pa;
6713     //     Rb = *--Pb;
6714     //     Rm = *++Pm;
6715     //     Rn = *--Pn;
6716 
6717     //     int iters = len*2-i-1;
6718     //     for (j = i-len+1; iters--; j++) {
6719     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6720     //       MACC(Ra, Rb, t0, t1, t2);
6721     //       Ra = *++Pa;
6722     //       Rb = *--Pb;
6723     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6724     //       MACC(Rm, Rn, t0, t1, t2);
6725     //       Rm = *++Pm;
6726     //       Rn = *--Pn;
6727     //     }
6728 
6729     //     Pm_base[i-len] = t0;
6730     //     t0 = t1; t1 = t2; t2 = 0;
6731     //   }
6732 
6733     //   while (t0)
6734     //     t0 = sub(Pm_base, Pn_base, t0, len);
6735     // }
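
    // where sub() is not spelled out above; mirroring normalize(), it
    // subtracts the modulus n once from m and propagates the borrow into t0
    // (a sketch, illustrative only):
    //
    //   static julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong a = Pm_base[i], b = Pn_base[i];
    //       julong d = a - b - borrow;
    //       borrow = (a < b) || (borrow && a == b);   // borrow out of this word
    //       Pm_base[i] = d;
    //     }
    //     return t0 - borrow;
    //   }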
6736 
6737     /**
6738      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
6739      * multiplies than Montgomery multiplication so it should be up to
6740      * 25% faster.  However, its loop control is more complex and it
6741      * may actually run slower on some machines.
6742      *
6743      * Arguments:
6744      *
6745      * Inputs:
6746      *   c_rarg0   - int array elements a
6747      *   c_rarg1   - int array elements n (the modulus)
6748      *   c_rarg2   - int length
6749      *   c_rarg3   - int inv
6750      *   c_rarg4   - int array elements m (the result)
6751      *
6752      */
6753     address generate_square() {
6754       Label argh;
6755       bind(argh);
6756       stop("MontgomeryMultiply total_allocation must be <= 8192");
6757 
6758       align(CodeEntryAlignment);
6759       address entry = pc();
6760 
6761       enter();
6762 
6763       // Make room.
6764       cmpw(Rlen, 512);
6765       br(Assembler::HI, argh);
6766       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6767       andr(sp, Ra, -2 * wordSize);
6768 
6769       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6770 
6771       {
6772         // Copy input args, reversing as we go.  We use Ra as a
6773         // temporary variable.
6774         reverse(Ra, Pa_base, Rlen, t0, t1);
6775         reverse(Ra, Pn_base, Rlen, t0, t1);
6776       }
6777 
6778       // Push all call-saved registers and also Pm_base which we'll need
6779       // at the end.
6780       save_regs();
6781 
6782       mov(Pm_base, Ra);
6783 
6784       mov(t0, zr);
6785       mov(t1, zr);
6786       mov(t2, zr);
6787 
6788       block_comment("for (int i = 0; i < len; i++) {");
6789       mov(Ri, zr); {
6790         Label loop, end;
6791         bind(loop);
6792         cmp(Ri, Rlen);
6793         br(Assembler::GE, end);
6794 
6795         pre1(Ri);
6796 
6797         block_comment("for (j = (i+1)/2; j; j--) {"); {
6798           add(Rj, Ri, 1);
6799           lsr(Rj, Rj, 1);
6800           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
6801         } block_comment("  } // j");
6802 
6803         last_squaring(Ri);
6804 
6805         block_comment("  for (j = i/2; j; j--) {"); {
6806           lsr(Rj, Ri, 1);
6807           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
6808         } block_comment("  } // j");
6809 
6810         post1_squaring();
6811         add(Ri, Ri, 1);
6812         cmp(Ri, Rlen);
6813         br(Assembler::LT, loop);
6814 
6815         bind(end);
6816         block_comment("} // i");
6817       }
6818 
6819       block_comment("for (int i = len; i < 2*len; i++) {");
6820       mov(Ri, Rlen); {
6821         Label loop, end;
6822         bind(loop);
6823         cmp(Ri, Rlen, Assembler::LSL, 1);
6824         br(Assembler::GE, end);
6825 
6826         pre2(Ri, Rlen);
6827 
6828         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
6829           lsl(Rj, Rlen, 1);
6830           sub(Rj, Rj, Ri);
6831           sub(Rj, Rj, 1);
6832           lsr(Rj, Rj, 1);
6833           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
6834         } block_comment("  } // j");
6835 
6836         last_squaring(Ri);
6837 
6838         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
6839           lsl(Rj, Rlen, 1);
6840           sub(Rj, Rj, Ri);
6841           lsr(Rj, Rj, 1);
6842           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
6843         } block_comment("  } // j");
6844 
6845         post2(Ri, Rlen);
6846         add(Ri, Ri, 1);
6847         cmp(Ri, Rlen, Assembler::LSL, 1);
6848 
6849         br(Assembler::LT, loop);
6850         bind(end);
6851         block_comment("} // i");
6852       }
6853 
6854       normalize(Rlen);
6855 
6856       mov(Ra, Pm_base);  // Save Pm_base in Ra
6857       restore_regs();  // Restore caller's Pm_base
6858 
6859       // Copy our result into caller's Pm_base
6860       reverse(Pm_base, Ra, Rlen, t0, t1);
6861 
6862       leave();
6863       ret(lr);
6864 
6865       return entry;
6866     }
6867     // In C, approximately:
6868 
6869     // void
6870     // montgomery_square(julong Pa_base[], julong Pn_base[],
6871     //                   julong Pm_base[], julong inv, int len) {
6872     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
6873     //   julong *Pa, *Pb, *Pn, *Pm;
6874     //   julong Ra, Rb, Rn, Rm;
6875 
6876     //   int i;
6877 
6878     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
6879 
6880     //   for (i = 0; i < len; i++) {
6881     //     int j;
6882 
6883     //     Pa = Pa_base;
6884     //     Pb = Pa_base + i;
6885     //     Pm = Pm_base;
6886     //     Pn = Pn_base + i;
6887 
6888     //     Ra = *Pa;
6889     //     Rb = *Pb;
6890     //     Rm = *Pm;
6891     //     Rn = *Pn;
6892 
6893     //     int iters = (i+1)/2;
6894     //     for (j = 0; iters--; j++) {
6895     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
6896     //       MACC2(Ra, Rb, t0, t1, t2);
6897     //       Ra = *++Pa;
6898     //       Rb = *--Pb;
6899     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6900     //       MACC(Rm, Rn, t0, t1, t2);
6901     //       Rm = *++Pm;
6902     //       Rn = *--Pn;
6903     //     }
6904     //     if ((i & 1) == 0) {
6905     //       assert(Ra == Pa_base[j], "must be");
6906     //       MACC(Ra, Ra, t0, t1, t2);
6907     //     }
6908     //     iters = i/2;
6909     //     assert(iters == i-j, "must be");
6910     //     for (; iters--; j++) {
6911     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6912     //       MACC(Rm, Rn, t0, t1, t2);
6913     //       Rm = *++Pm;
6914     //       Rn = *--Pn;
6915     //     }
6916 
6917     //     *Pm = Rm = t0 * inv;
6918     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
6919     //     MACC(Rm, Rn, t0, t1, t2);
6920 
6921     //     assert(t0 == 0, "broken Montgomery multiply");
6922 
6923     //     t0 = t1; t1 = t2; t2 = 0;
6924     //   }
6925 
6926     //   for (i = len; i < 2*len; i++) {
6927     //     int start = i-len+1;
6928     //     int end = start + (len - start)/2;
6929     //     int j;
6930 
6931     //     Pa = Pa_base + i-len;
6932     //     Pb = Pa_base + len;
6933     //     Pm = Pm_base + i-len;
6934     //     Pn = Pn_base + len;
6935 
6936     //     Ra = *++Pa;
6937     //     Rb = *--Pb;
6938     //     Rm = *++Pm;
6939     //     Rn = *--Pn;
6940 
6941     //     int iters = (2*len-i-1)/2;
6942     //     assert(iters == end-start, "must be");
6943     //     for (j = start; iters--; j++) {
6944     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
6945     //       MACC2(Ra, Rb, t0, t1, t2);
6946     //       Ra = *++Pa;
6947     //       Rb = *--Pb;
6948     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6949     //       MACC(Rm, Rn, t0, t1, t2);
6950     //       Rm = *++Pm;
6951     //       Rn = *--Pn;
6952     //     }
6953     //     if ((i & 1) == 0) {
6954     //       assert(Ra == Pa_base[j], "must be");
6955     //       MACC(Ra, Ra, t0, t1, t2);
6956     //     }
6957     //     iters =  (2*len-i)/2;
6958     //     assert(iters == len-j, "must be");
6959     //     for (; iters--; j++) {
6960     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6961     //       MACC(Rm, Rn, t0, t1, t2);
6962     //       Rm = *++Pm;
6963     //       Rn = *--Pn;
6964     //     }
6965     //     Pm_base[i-len] = t0;
6966     //     t0 = t1; t1 = t2; t2 = 0;
6967     //   }
6968 
6969     //   while (t0)
6970     //     t0 = sub(Pm_base, Pn_base, t0, len);
6971     // }
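    // A hedged C model of the MACC / MACC2 helpers used in the sketch
    // above (assumed semantics only; the stub itself implements them with
    // mul/umulh and explicit carry chains, and unsigned __int128 is a
    // compiler extension used here purely for exposition). Each call folds
    // a 128-bit product into the triple-precision accumulator t2:t1:t0.
    // Because inv is the negated modular inverse of Pn_base[0] mod 2^64,
    // accumulating Rm*Rn with Rm = t0*inv cancels the low accumulator
    // word, which is what assert(t0 == 0) checks.
    //
    //   static void MACC(julong a, julong b,
    //                    julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;
    //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;  // add low product word
    //     t0 = (julong)s;
    //     s = (s >> 64) + t1 + (julong)(p >> 64);                   // carry + high product word
    //     t1 = (julong)s;
    //     t2 += (julong)(s >> 64);                                  // propagate final carry
    //   }
    //
    //   // MACC2 accumulates 2*a*b: off-diagonal terms of a square occur
    //   // twice. Semantically it is just MACC applied twice.
    //   static void MACC2(julong a, julong b,
    //                     julong &t0, julong &t1, julong &t2) {
    //     MACC(a, b, t0, t1, t2);
    //     MACC(a, b, t0, t1, t2);
    //   }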
6972   };
6973 
6974 
6975   // Initialization
6976   void generate_initial() {
6977     // Generate initial stubs and initialize the entry points
6978 
6979     // Entry points that exist on all platforms. Note: this is code
6980     // that could be shared among different platforms - however, the
6981     // benefit seems to be smaller than the disadvantage of having a
6982     // much more complicated generator structure. See also the comment
6983     // in stubRoutines.hpp.
6984 
6985     StubRoutines::_forward_exception_entry = generate_forward_exception();
6986 
6987     StubRoutines::_call_stub_entry =
6988       generate_call_stub(StubRoutines::_call_stub_return_address);
6989 
6990     // This entry is referenced by megamorphic calls
6991     StubRoutines::_catch_exception_entry = generate_catch_exception();
6992 
6993     // Build this early so it's available for the interpreter.
6994     StubRoutines::_throw_StackOverflowError_entry =
6995       generate_throw_exception("StackOverflowError throw_exception",
6996                                CAST_FROM_FN_PTR(address,
6997                                                 SharedRuntime::throw_StackOverflowError));
6998     StubRoutines::_throw_delayed_StackOverflowError_entry =
6999       generate_throw_exception("delayed StackOverflowError throw_exception",
7000                                CAST_FROM_FN_PTR(address,
7001                                                 SharedRuntime::throw_delayed_StackOverflowError));
7002     if (UseCRC32Intrinsics) {
7003       // Set the table address before generating stubs that use it
7004       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7005       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7006     }
7007 
7008     if (UseCRC32CIntrinsics) {
7009       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7010     }
7011 
7012     // Disabled until JDK-8210858 is fixed
7013     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7014     //   StubRoutines::_dlog = generate_dlog();
7015     // }
7016 
7017     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7018       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7019     }
7020 
7021     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7022       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7023     }
7024 
7025     // Safefetch stubs.
7026     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7027                                                        &StubRoutines::_safefetch32_fault_pc,
7028                                                        &StubRoutines::_safefetch32_continuation_pc);
7029     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7030                                                        &StubRoutines::_safefetchN_fault_pc,
7031                                                        &StubRoutines::_safefetchN_continuation_pc);
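    // A hedged usage note: the SafeFetch stubs read a word from an address
    // that may be unmapped. If the load faults, the signal handler resumes
    // execution at the recorded continuation pc and the caller-supplied
    // error value is returned instead, roughly:
    //
    //   int      v = SafeFetch32(possibly_bad_int_ptr, /*errValue=*/ -1);
    //   intptr_t w = SafeFetchN(possibly_bad_ptr,      /*errValue=*/ 0);
    //
    // (possibly_bad_int_ptr / possibly_bad_ptr are illustrative names.)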
7032   }
7033 
7034   void generate_all() {
7035     // support for verify_oop (must happen after universe_init)
7036     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7037     StubRoutines::_throw_AbstractMethodError_entry =
7038       generate_throw_exception("AbstractMethodError throw_exception",
7039                                CAST_FROM_FN_PTR(address,
7040                                                 SharedRuntime::
7041                                                 throw_AbstractMethodError));
7042 
7043     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7044       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7045                                CAST_FROM_FN_PTR(address,
7046                                                 SharedRuntime::
7047                                                 throw_IncompatibleClassChangeError));
7048 
7049     StubRoutines::_throw_NullPointerException_at_call_entry =
7050       generate_throw_exception("NullPointerException at call throw_exception",
7051                                CAST_FROM_FN_PTR(address,
7052                                                 SharedRuntime::
7053                                                 throw_NullPointerException_at_call));
7054 
7055     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7056 
7057     // arraycopy stubs used by compilers
7058     generate_arraycopy_stubs();
7059 
7060     // has negatives stub for large arrays.
7061     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7062 
7063     // array equals stub for large arrays.
7064     if (!UseSimpleArrayEquals) {
7065       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7066     }
7067 
7068     generate_compare_long_strings();
7069 
7070     generate_string_indexof_stubs();
7071 
7072     // byte_array_inflate stub for large arrays.
7073     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7074 
7075     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7076     if (bs_nm != NULL) {
7077       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7078     }
7079 #ifdef COMPILER2
7080     if (UseMultiplyToLenIntrinsic) {
7081       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7082     }
7083 
7084     if (UseSquareToLenIntrinsic) {
7085       StubRoutines::_squareToLen = generate_squareToLen();
7086     }
7087 
7088     if (UseMulAddIntrinsic) {
7089       StubRoutines::_mulAdd = generate_mulAdd();
7090     }
7091 
7092     if (UseSIMDForBigIntegerShiftIntrinsics) {
7093       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7094       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7095     }
7096 
7097     if (UseMontgomeryMultiplyIntrinsic) {
7098       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7099       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7100       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7101     }
7102 
7103     if (UseMontgomerySquareIntrinsic) {
7104       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7105       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7106       // We use generate_multiply() rather than generate_square()
7107       // because it is faster for the modulus sizes we care about.
7108       StubRoutines::_montgomerySquare = g.generate_multiply();
7109     }
7110 #endif // COMPILER2
7111 
7112     // generate GHASH intrinsics code
7113     if (UseGHASHIntrinsics) {
7114       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7115     }
7116 
7117     if (UseBASE64Intrinsics) {
7118       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7119       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7120     }
7121 
7122     // data cache line writeback
7123     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7124     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7125 
7126     if (UseAESIntrinsics) {
7127       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7128       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7129       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7130       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7131     }
7132 
7133     if (UseSHA1Intrinsics) {
7134       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7135       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7136     }
7137     if (UseSHA256Intrinsics) {
7138       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7139       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7140     }
7141     if (UseSHA512Intrinsics) {
7142       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7143       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7144     }
7145     if (UseSHA3Intrinsics) {
7146       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7147       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7148     }
7149 
7150     // generate Adler32 intrinsics code
7151     if (UseAdler32Intrinsics) {
7152       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7153     }
7154 
7155 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
7156 
7157     generate_atomic_entry_points();
7158 
7159 #endif // LINUX || _ALLBSD_SOURCE
7160 
7161     StubRoutines::aarch64::set_completed();
7162   }
7163 
7164  public:
7165   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7166     if (all) {
7167       generate_all();
7168     } else {
7169       generate_initial();
7170     }
7171   }
7172 }; // end class declaration
7173 
7174 #define UCM_TABLE_MAX_ENTRIES 8
7175 void StubGenerator_generate(CodeBuffer* code, bool all) {
7176   if (UnsafeCopyMemory::_table == NULL) {
7177     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7178   }
7179   StubGenerator g(code, all);
7180 }
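// A hedged note on how StubGenerator_generate is driven (based on the
// shared StubRoutines initialization; the buffer names below are
// illustrative and may differ by JDK version): it is expected to be
// called twice, once early in VM startup with all == false so the
// interpreter can use the initial stubs, and once after universe_init
// with all == true, roughly:
//
//   CodeBuffer buffer1(StubRoutines::_code1);
//   StubGenerator_generate(&buffer1, /*all=*/ false);  // -> generate_initial()
//   ...
//   CodeBuffer buffer2(StubRoutines::_code2);
//   StubGenerator_generate(&buffer2, /*all=*/ true);   // -> generate_all()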
7181 
7182 
7183 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
7184 
7185 // Define pointers to atomic stubs and initialize them to point to the
7186 // code in atomic_aarch64.S (an example expansion follows the list below).
7187 
7188 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7189   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7190     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7191   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7192     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
7193 
7194 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7195 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7196 DEFAULT_ATOMIC_OP(xchg, 4, )
7197 DEFAULT_ATOMIC_OP(xchg, 8, )
7198 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7199 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7200 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7201 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7202 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7203 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
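// For illustration, the first invocation above expands (modulo whitespace)
// to a declaration of the out-of-line default and a stub pointer
// initialized to it; generate_atomic_entry_points() may later repoint
// these at generated code:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;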
7204 
7205 #undef DEFAULT_ATOMIC_OP
7206 
7207 #endif // LINUX || _ALLBSD_SOURCE
7208