1 /*
2 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "nativeInst_aarch64.hpp"
34 #include "oops/instanceOop.hpp"
35 #include "oops/method.hpp"
36 #include "oops/objArrayKlass.hpp"
37 #include "oops/oop.inline.hpp"
38 #include "prims/methodHandles.hpp"
39 #include "runtime/atomic.hpp"
40 #include "runtime/frame.inline.hpp"
41 #include "runtime/handles.inline.hpp"
42 #include "runtime/sharedRuntime.hpp"
43 #include "runtime/stubCodeGenerator.hpp"
44 #include "runtime/stubRoutines.hpp"
45 #include "runtime/thread.inline.hpp"
46 #include "utilities/align.hpp"
47 #ifdef COMPILER2
48 #include "opto/runtime.hpp"
49 #endif
50
51 // Declaration and definition of StubGenerator (no .hpp file).
52 // For a more detailed description of the stub routine structure
53 // see the comment in stubRoutines.hpp
54
55 #undef __
56 #define __ _masm->
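// TIMES_OOP scales an index register by the in-heap oop size: 4 bytes when compressed oops are in use, 8 otherwise.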
57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
58
59 #ifdef PRODUCT
60 #define BLOCK_COMMENT(str) /* nothing */
61 #else
62 #define BLOCK_COMMENT(str) __ block_comment(str)
63 #endif
64
65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
66
67 // Stub Code definitions
68
69 class StubGenerator: public StubCodeGenerator {
70 private:
71
72 #ifdef PRODUCT
73 #define inc_counter_np(counter) ((void)0)
74 #else
75 void inc_counter_np_(int& counter) {
76 __ lea(rscratch2, ExternalAddress((address)&counter));
77 __ ldrw(rscratch1, Address(rscratch2));
78 __ addw(rscratch1, rscratch1, 1);
79 __ strw(rscratch1, Address(rscratch2));
80 }
81 #define inc_counter_np(counter) \
82 BLOCK_COMMENT("inc_counter " #counter); \
83 inc_counter_np_(counter);
84 #endif
85
86 // Call stubs are used to call Java from C
87 //
88 // Arguments:
89 // c_rarg0: call wrapper address address
90 // c_rarg1: result address
91 // c_rarg2: result type BasicType
92 // c_rarg3: method Method*
93 // c_rarg4: (interpreter) entry point address
94 // c_rarg5: parameters intptr_t*
95 // c_rarg6: parameter size (in words) int
96 // c_rarg7: thread Thread*
97 //
98 // There is no return from the stub itself as any Java result
99 // is written to result
100 //
101 // we save r30 (lr) as the return PC at the base of the frame and
102 // link r29 (fp) below it as the frame pointer, then install sp
103 // (r31) into fp.
104 //
105 // we save r0-r7, which accounts for all the c arguments.
106 //
107 // TODO: strictly do we need to save them all? they are treated as
108 // volatile by C so could we omit saving the ones we are going to
109 // place in global registers (thread? method?) or those we only use
110 // during setup of the Java call?
111 //
112 // we don't need to save r8 which C uses as an indirect result location
113 // return register.
114 //
115 // we don't need to save r9-r15 which both C and Java treat as
116 // volatile
117 //
118 // we don't need to save r16-18 because Java does not use them
119 //
120 // we save r19-r28 which Java uses as scratch registers and C
121 // expects to be callee-save
122 //
123 // we save the bottom 64 bits of each value stored in v8-v15; it is
124 // the responsibility of the caller to preserve larger values.
125 //
126 // so the stub frame looks like this when we enter Java code
127 //
128 // [ return_from_Java ] <--- sp
129 // [ argument word n ]
130 // ...
131 // -27 [ argument word 1 ]
132 // -26 [ saved v15 ] <--- sp_after_call
133 // -25 [ saved v14 ]
134 // -24 [ saved v13 ]
135 // -23 [ saved v12 ]
136 // -22 [ saved v11 ]
137 // -21 [ saved v10 ]
138 // -20 [ saved v9 ]
139 // -19 [ saved v8 ]
140 // -18 [ saved r28 ]
141 // -17 [ saved r27 ]
142 // -16 [ saved r26 ]
143 // -15 [ saved r25 ]
144 // -14 [ saved r24 ]
145 // -13 [ saved r23 ]
146 // -12 [ saved r22 ]
147 // -11 [ saved r21 ]
148 // -10 [ saved r20 ]
149 // -9 [ saved r19 ]
150 // -8 [ call wrapper (r0) ]
151 // -7 [ result (r1) ]
152 // -6 [ result type (r2) ]
153 // -5 [ method (r3) ]
154 // -4 [ entry point (r4) ]
155 // -3 [ parameters (r5) ]
156 // -2 [ parameter size (r6) ]
157 // -1 [ thread (r7) ]
158 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
159 // 1 [ saved lr (r30) ]
160
161 // Call stub stack layout word offsets from fp
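// (only every other slot has a named offset because registers are saved and restored in stp/ldp pairs)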
162 enum call_stub_layout {
163 sp_after_call_off = -26,
164
165 d15_off = -26,
166 d13_off = -24,
167 d11_off = -22,
168 d9_off = -20,
169
170 r28_off = -18,
171 r26_off = -16,
172 r24_off = -14,
173 r22_off = -12,
174 r20_off = -10,
175 call_wrapper_off = -8,
176 result_off = -7,
177 result_type_off = -6,
178 method_off = -5,
179 entry_point_off = -4,
180 parameter_size_off = -2,
181 thread_off = -1,
182 fp_f = 0,
183 retaddr_off = 1,
184 };
185
186 address generate_call_stub(address& return_address) {
187 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
188 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
189 "adjust this code");
190
191 StubCodeMark mark(this, "StubRoutines", "call_stub");
192 address start = __ pc();
193
194 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
195
196 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
197 const Address result (rfp, result_off * wordSize);
198 const Address result_type (rfp, result_type_off * wordSize);
199 const Address method (rfp, method_off * wordSize);
200 const Address entry_point (rfp, entry_point_off * wordSize);
201 const Address parameter_size(rfp, parameter_size_off * wordSize);
202
203 const Address thread (rfp, thread_off * wordSize);
204
205 const Address d15_save (rfp, d15_off * wordSize);
206 const Address d13_save (rfp, d13_off * wordSize);
207 const Address d11_save (rfp, d11_off * wordSize);
208 const Address d9_save (rfp, d9_off * wordSize);
209
210 const Address r28_save (rfp, r28_off * wordSize);
211 const Address r26_save (rfp, r26_off * wordSize);
212 const Address r24_save (rfp, r24_off * wordSize);
213 const Address r22_save (rfp, r22_off * wordSize);
214 const Address r20_save (rfp, r20_off * wordSize);
215
216 // stub code
217
218 address aarch64_entry = __ pc();
219
220 // set up frame and move sp to end of save area
221 __ enter();
222 __ sub(sp, rfp, -sp_after_call_off * wordSize);
223
224 // save register parameters and Java scratch/global registers
225 // n.b. we save thread even though it gets installed in
226 // rthread because we want to sanity check rthread later
227 __ str(c_rarg7, thread);
228 __ strw(c_rarg6, parameter_size);
229 __ stp(c_rarg4, c_rarg5, entry_point);
230 __ stp(c_rarg2, c_rarg3, result_type);
231 __ stp(c_rarg0, c_rarg1, call_wrapper);
232
233 __ stp(r20, r19, r20_save);
234 __ stp(r22, r21, r22_save);
235 __ stp(r24, r23, r24_save);
236 __ stp(r26, r25, r26_save);
237 __ stp(r28, r27, r28_save);
238
239 __ stpd(v9, v8, d9_save);
240 __ stpd(v11, v10, d11_save);
241 __ stpd(v13, v12, d13_save);
242 __ stpd(v15, v14, d15_save);
243
244 // install Java thread in global register now we have saved
245 // whatever value it held
246 __ mov(rthread, c_rarg7);
247 // And method
248 __ mov(rmethod, c_rarg3);
249
250 // set up the heapbase register
251 __ reinit_heapbase();
252
253 #ifdef ASSERT
254 // make sure we have no pending exceptions
255 {
256 Label L;
257 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
258 __ cmp(rscratch1, (unsigned)NULL_WORD);
259 __ br(Assembler::EQ, L);
260 __ stop("StubRoutines::call_stub: entered with pending exception");
261 __ BIND(L);
262 }
263 #endif
264 // pass parameters if any
265 __ mov(esp, sp);
266 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
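// Round the adjusted sp down to a 16-byte boundary; AArch64 requires sp to stay 16-byte aligned.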
267 __ andr(sp, rscratch1, -2 * wordSize);
268
269 BLOCK_COMMENT("pass parameters if any");
270 Label parameters_done;
271 // parameter count is still in c_rarg6
272 // and parameter pointer identifying param 1 is in c_rarg5
273 __ cbzw(c_rarg6, parameters_done);
274
275 address loop = __ pc();
276 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
277 __ subsw(c_rarg6, c_rarg6, 1);
278 __ push(rscratch1);
279 __ br(Assembler::GT, loop);
280
281 __ BIND(parameters_done);
282
283 // call Java entry -- passing Method*, and current sp
284 // rmethod: Method*
285 // r13: sender sp
286 BLOCK_COMMENT("call Java function");
287 __ mov(r13, sp);
288 __ blr(c_rarg4);
289
290 // we do this here because the notify will already have been done
291 // if we get to the next instruction via an exception
292 //
293 // n.b. adding this instruction here affects the calculation of
294 // whether or not a routine returns to the call stub (used when
295 // doing stack walks) since the normal test is to check the return
296 // pc against the address saved below. so we may need to allow for
297 // this extra instruction in the check.
298
299 // save current address for use by exception handling code
300
301 return_address = __ pc();
302
303 // store result depending on type (everything that is not
304 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
305 // n.b. this assumes Java returns an integral result in r0
306 // and a floating result in j_farg0
307 __ ldr(j_rarg2, result);
308 Label is_long, is_float, is_double, exit;
309 __ ldr(j_rarg1, result_type);
310 __ cmp(j_rarg1, T_OBJECT);
311 __ br(Assembler::EQ, is_long);
312 __ cmp(j_rarg1, T_LONG);
313 __ br(Assembler::EQ, is_long);
314 __ cmp(j_rarg1, T_FLOAT);
315 __ br(Assembler::EQ, is_float);
316 __ cmp(j_rarg1, T_DOUBLE);
317 __ br(Assembler::EQ, is_double);
318
319 // handle T_INT case
320 __ strw(r0, Address(j_rarg2));
321
322 __ BIND(exit);
323
324 // pop parameters
325 __ sub(esp, rfp, -sp_after_call_off * wordSize);
326
327 #ifdef ASSERT
328 // verify that threads correspond
329 {
330 Label L, S;
331 __ ldr(rscratch1, thread);
332 __ cmp(rthread, rscratch1);
333 __ br(Assembler::NE, S);
334 __ get_thread(rscratch1);
335 __ cmp(rthread, rscratch1);
336 __ br(Assembler::EQ, L);
337 __ BIND(S);
338 __ stop("StubRoutines::call_stub: threads must correspond");
339 __ BIND(L);
340 }
341 #endif
342
343 // restore callee-save registers
344 __ ldpd(v15, v14, d15_save);
345 __ ldpd(v13, v12, d13_save);
346 __ ldpd(v11, v10, d11_save);
347 __ ldpd(v9, v8, d9_save);
348
349 __ ldp(r28, r27, r28_save);
350 __ ldp(r26, r25, r26_save);
351 __ ldp(r24, r23, r24_save);
352 __ ldp(r22, r21, r22_save);
353 __ ldp(r20, r19, r20_save);
354
355 __ ldp(c_rarg0, c_rarg1, call_wrapper);
356 __ ldrw(c_rarg2, result_type);
357 __ ldr(c_rarg3, method);
358 __ ldp(c_rarg4, c_rarg5, entry_point);
359 __ ldp(c_rarg6, c_rarg7, parameter_size);
360
361 // leave frame and return to caller
362 __ leave();
363 __ ret(lr);
364
365 // handle return types different from T_INT
366
367 __ BIND(is_long);
368 __ str(r0, Address(j_rarg2, 0));
369 __ br(Assembler::AL, exit);
370
371 __ BIND(is_float);
372 __ strs(j_farg0, Address(j_rarg2, 0));
373 __ br(Assembler::AL, exit);
374
375 __ BIND(is_double);
376 __ strd(j_farg0, Address(j_rarg2, 0));
377 __ br(Assembler::AL, exit);
378
379 return start;
380 }
381
382 // Return point for a Java call if there's an exception thrown in
383 // Java code. The exception is caught and transformed into a
384 // pending exception stored in JavaThread that can be tested from
385 // within the VM.
386 //
387 // Note: Usually the parameters are removed by the callee. In case
388 // of an exception crossing an activation frame boundary, that is
389 // not the case if the callee is compiled code => need to setup the
390 // rsp.
391 //
392 // r0: exception oop
393
394 address generate_catch_exception() {
395 StubCodeMark mark(this, "StubRoutines", "catch_exception");
396 address start = __ pc();
397
398 // same as in generate_call_stub():
399 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
400 const Address thread (rfp, thread_off * wordSize);
401
402 #ifdef ASSERT
403 // verify that threads correspond
404 {
405 Label L, S;
406 __ ldr(rscratch1, thread);
407 __ cmp(rthread, rscratch1);
408 __ br(Assembler::NE, S);
409 __ get_thread(rscratch1);
410 __ cmp(rthread, rscratch1);
411 __ br(Assembler::EQ, L);
412 __ bind(S);
413 __ stop("StubRoutines::catch_exception: threads must correspond");
414 __ bind(L);
415 }
416 #endif
417
418 // set pending exception
419 __ verify_oop(r0);
420
421 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
422 __ mov(rscratch1, (address)__FILE__);
423 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
424 __ movw(rscratch1, (int)__LINE__);
425 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
426
427 // complete return to VM
428 assert(StubRoutines::_call_stub_return_address != NULL,
429 "_call_stub_return_address must have been generated before");
430 __ b(StubRoutines::_call_stub_return_address);
431
432 return start;
433 }
434
435 // Continuation point for runtime calls returning with a pending
436 // exception. The pending exception check happened in the runtime
437 // or native call stub. The pending exception in Thread is
438 // converted into a Java-level exception.
439 //
440 // Contract with Java-level exception handlers:
441 // r0: exception
442 // r3: throwing pc
443 //
444 // NOTE: At entry of this stub, exception-pc must be in LR !!
445
446 // NOTE: this is always used as a jump target within generated code
447 // so it just needs to be generated code with no prolog
448
449 address generate_forward_exception() {
450 StubCodeMark mark(this, "StubRoutines", "forward exception");
451 address start = __ pc();
452
453 // Upon entry, LR points to the return address returning into
454 // Java (interpreted or compiled) code; i.e., the return address
455 // becomes the throwing pc.
456 //
457 // Arguments pushed before the runtime call are still on the stack
458 // but the exception handler will reset the stack pointer ->
459 // ignore them. A potential result in registers can be ignored as
460 // well.
461
462 #ifdef ASSERT
463 // make sure this code is only executed if there is a pending exception
464 {
465 Label L;
466 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
467 __ cbnz(rscratch1, L);
468 __ stop("StubRoutines::forward exception: no pending exception (1)");
469 __ bind(L);
470 }
471 #endif
472
473 // compute exception handler into r19
474
475 // call the VM to find the handler address associated with the
476 // caller address. pass thread in r0 and caller pc (ret address)
477 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
478 // the stack.
479 __ mov(c_rarg1, lr);
480 // lr will be trashed by the VM call so we move it to R19
481 // (callee-saved) because we also need to pass it to the handler
482 // returned by this call.
483 __ mov(r19, lr);
484 BLOCK_COMMENT("call exception_handler_for_return_address");
485 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
486 SharedRuntime::exception_handler_for_return_address),
487 rthread, c_rarg1);
488 // we should not really care that lr is no longer the callee
489 // address. we saved the value the handler needs in r19 so we can
490 // just copy it to r3. however, the C2 handler will push its own
491 // frame and then call into the VM and the VM code asserts that
492 // the PC for the frame above the handler belongs to a compiled
493 // Java method. So, we restore lr here to satisfy that assert.
494 __ mov(lr, r19);
495 // setup r0 & r3 & clear pending exception
496 __ mov(r3, r19);
497 __ mov(r19, r0);
498 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
499 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
500
501 #ifdef ASSERT
502 // make sure exception is set
503 {
504 Label L;
505 __ cbnz(r0, L);
506 __ stop("StubRoutines::forward exception: no pending exception (2)");
507 __ bind(L);
508 }
509 #endif
510
511 // continue at exception handler
512 // r0: exception
513 // r3: throwing pc
514 // r19: exception handler
515 __ verify_oop(r0);
516 __ br(r19);
517
518 return start;
519 }
520
521 // Non-destructive plausibility checks for oops
522 //
523 // Arguments:
524 // r0: oop to verify
525 // rscratch1: error message
526 //
527 // Stack after saving c_rarg3:
528 // [tos + 0]: saved c_rarg3
529 // [tos + 1]: saved c_rarg2
530 // [tos + 2]: saved lr
531 // [tos + 3]: saved rscratch2
532 // [tos + 4]: saved r0
533 // [tos + 5]: saved rscratch1
534 address generate_verify_oop() {
535
536 StubCodeMark mark(this, "StubRoutines", "verify_oop");
537 address start = __ pc();
538
539 Label exit, error;
540
541 // save c_rarg2 and c_rarg3
542 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
543
544 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
545 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
546 __ ldr(c_rarg3, Address(c_rarg2));
547 __ add(c_rarg3, c_rarg3, 1);
548 __ str(c_rarg3, Address(c_rarg2));
549
550 // object is in r0
551 // make sure object is 'reasonable'
552 __ cbz(r0, exit); // if obj is NULL it is OK
553
554 // Check if the oop is in the right area of memory
555 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
556 __ andr(c_rarg2, r0, c_rarg3);
557 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
558
559 // Compare c_rarg2 and c_rarg3. We don't use a compare
560 // instruction here because the flags register is live.
561 __ eor(c_rarg2, c_rarg2, c_rarg3);
562 __ cbnz(c_rarg2, error);
563
564 // make sure klass is 'reasonable', i.e. not zero.
565 __ load_klass(r0, r0); // get klass
566 __ cbz(r0, error); // if klass is NULL it is broken
567
568 // return if everything seems ok
569 __ bind(exit);
570
571 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
572 __ ret(lr);
573
574 // handle errors
575 __ bind(error);
576 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
577
578 __ push(RegSet::range(r0, r29), sp);
579 // debug(char* msg, int64_t pc, int64_t regs[])
580 __ mov(c_rarg0, rscratch1); // pass address of error message
581 __ mov(c_rarg1, lr); // pass return address
582 __ mov(c_rarg2, sp); // pass address of regs on stack
583 #ifndef PRODUCT
584 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
585 #endif
586 BLOCK_COMMENT("call MacroAssembler::debug");
587 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
588 __ blr(rscratch1);
589
590 return start;
591 }
592
593 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
594
595 // The inner part of zero_words(). This is the bulk operation,
596 // zeroing words in blocks, possibly using DC ZVA to do it. The
597 // caller is responsible for zeroing the last few words.
598 //
599 // Inputs:
600 // r10: the HeapWord-aligned base address of an array to zero.
601 // r11: the count in HeapWords, r11 > 0.
602 //
603 // Returns r10 and r11, adjusted for the caller to clear.
604 // r10: the base address of the tail of words left to clear.
605 // r11: the number of words in the tail.
606 // r11 < MacroAssembler::zero_words_block_size.
607
608 address generate_zero_blocks() {
609 Label store_pair, loop_store_pair, done;
610 Label base_aligned;
611
612 Register base = r10, cnt = r11;
613
614 __ align(CodeEntryAlignment);
615 StubCodeMark mark(this, "StubRoutines", "zero_blocks");
616 address start = __ pc();
617
618 if (UseBlockZeroing) {
619 int zva_length = VM_Version::zva_length();
620
621 // Ensure ZVA length can be divided by 16. This is required by
622 // the subsequent operations.
623 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
624
625 __ tbz(base, 3, base_aligned);
626 __ str(zr, Address(__ post(base, 8)));
627 __ sub(cnt, cnt, 1);
628 __ bind(base_aligned);
629
630 // Ensure count >= zva_length * 2 so that it still deserves a zva after
631 // alignment.
632 Label small;
633 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
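// low_limit is in bytes while cnt is in HeapWords, hence the shift right by 3 below.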
634 __ subs(rscratch1, cnt, low_limit >> 3);
635 __ br(Assembler::LT, small);
636 __ zero_dcache_blocks(base, cnt);
637 __ bind(small);
638 }
639
640 {
641 // Number of stp instructions we'll unroll
642 const int unroll =
643 MacroAssembler::zero_words_block_size / 2;
644 // Clear the remaining blocks.
645 Label loop;
646 __ subs(cnt, cnt, unroll * 2);
647 __ br(Assembler::LT, done);
648 __ bind(loop);
649 for (int i = 0; i < unroll; i++)
650 __ stp(zr, zr, __ post(base, 16));
651 __ subs(cnt, cnt, unroll * 2);
652 __ br(Assembler::GE, loop);
653 __ bind(done);
654 __ add(cnt, cnt, unroll * 2);
655 }
656
657 __ ret(lr);
658
659 return start;
660 }
661
662
663 typedef enum {
664 copy_forwards = 1,
665 copy_backwards = -1
666 } copy_direction;
667
668 // Bulk copy of blocks of 8 words.
669 //
670 // count is a count of words.
671 //
672 // Precondition: count >= 8
673 //
674 // Postconditions:
675 //
676 // The least significant bit of count contains the remaining count
677 // of words to copy. The rest of count is trash.
678 //
679 // s and d are adjusted to point to the remaining words to copy
680 //
681 void generate_copy_longs(Label &start, Register s, Register d, Register count,
682 copy_direction direction) {
683 int unit = wordSize * direction;
684 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
685
686 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
687 t4 = r7, t5 = r10, t6 = r11, t7 = r12;
688 const Register stride = r13;
689
690 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
691 assert_different_registers(s, d, count, rscratch1);
692
693 Label again, drain;
694 const char *stub_name;
695 if (direction == copy_forwards)
696 stub_name = "forward_copy_longs";
697 else
698 stub_name = "backward_copy_longs";
699
700 __ align(CodeEntryAlignment);
701
702 StubCodeMark mark(this, "StubRoutines", stub_name);
703
704 __ bind(start);
705
706 Label unaligned_copy_long;
707 if (AvoidUnalignedAccesses) {
708 __ tbnz(d, 3, unaligned_copy_long);
709 }
710
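// For a forward copy, bias s and d downwards so that the first, non-pre-indexed
// load/store pair addresses the original s and d and all subsequent accesses use
// positive offsets.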
711 if (direction == copy_forwards) {
712 __ sub(s, s, bias);
713 __ sub(d, d, bias);
714 }
715
716 #ifdef ASSERT
717 // Make sure we are never given < 8 words
718 {
719 Label L;
720 __ cmp(count, 8);
721 __ br(Assembler::GE, L);
722 __ stop("generate_copy_longs called with < 8 words");
723 __ bind(L);
724 }
725 #endif
726
727 // Fill 8 registers
728 if (UseSIMDForMemoryOps) {
729 __ ldpq(v0, v1, Address(s, 4 * unit));
730 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
731 } else {
732 __ ldp(t0, t1, Address(s, 2 * unit));
733 __ ldp(t2, t3, Address(s, 4 * unit));
734 __ ldp(t4, t5, Address(s, 6 * unit));
735 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
736 }
737
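// Eight words are already loaded; enter the main loop only if at least eight more
// remain to be loaded (count >= 16), otherwise go straight to the drain code.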
738 __ subs(count, count, 16);
739 __ br(Assembler::LO, drain);
740
741 int prefetch = PrefetchCopyIntervalInBytes;
742 bool use_stride = false;
743 if (direction == copy_backwards) {
744 use_stride = prefetch > 256;
745 prefetch = -prefetch;
746 if (use_stride) __ mov(stride, prefetch);
747 }
748
749 __ bind(again);
750
751 if (PrefetchCopyIntervalInBytes > 0)
752 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
753
754 if (UseSIMDForMemoryOps) {
755 __ stpq(v0, v1, Address(d, 4 * unit));
756 __ ldpq(v0, v1, Address(s, 4 * unit));
757 __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
758 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
759 } else {
760 __ stp(t0, t1, Address(d, 2 * unit));
761 __ ldp(t0, t1, Address(s, 2 * unit));
762 __ stp(t2, t3, Address(d, 4 * unit));
763 __ ldp(t2, t3, Address(s, 4 * unit));
764 __ stp(t4, t5, Address(d, 6 * unit));
765 __ ldp(t4, t5, Address(s, 6 * unit));
766 __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
767 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
768 }
769
770 __ subs(count, count, 8);
771 __ br(Assembler::HS, again);
772
773 // Drain
774 __ bind(drain);
775 if (UseSIMDForMemoryOps) {
776 __ stpq(v0, v1, Address(d, 4 * unit));
777 __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
778 } else {
779 __ stp(t0, t1, Address(d, 2 * unit));
780 __ stp(t2, t3, Address(d, 4 * unit));
781 __ stp(t4, t5, Address(d, 6 * unit));
782 __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
783 }
784
785 {
786 Label L1, L2;
787 __ tbz(count, exact_log2(4), L1);
788 if (UseSIMDForMemoryOps) {
789 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
790 __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
791 } else {
792 __ ldp(t0, t1, Address(s, 2 * unit));
793 __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
794 __ stp(t0, t1, Address(d, 2 * unit));
795 __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
796 }
797 __ bind(L1);
798
799 if (direction == copy_forwards) {
800 __ add(s, s, bias);
801 __ add(d, d, bias);
802 }
803
804 __ tbz(count, 1, L2);
805 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
806 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
807 __ bind(L2);
808 }
809
810 __ ret(lr);
811
812 if (AvoidUnalignedAccesses) {
813 Label drain, again;
814 // Register order for storing. Order is different for backward copy.
815
816 __ bind(unaligned_copy_long);
817
818 // source address is even aligned, target odd aligned
819 //
820 // when forward copying word pairs we read long pairs at offsets
821 // {0, 2, 4, 6} (in long words). when backwards copying we read
822 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
823 // address by -2 in the forwards case so we can compute the
824 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
825 // or -1.
826 //
827 // when forward copying we need to store 1 word, 3 pairs and
828 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
829 // zero offset we adjust the destination by -1 which means we
830 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
831 //
832 // When backwards copying we need to store 1 word, 3 pairs and
833 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
834 // offsets {1, 3, 5, 7, 8} * unit.
835
836 if (direction == copy_forwards) {
837 __ sub(s, s, 16);
838 __ sub(d, d, 8);
839 }
840
841 // Fill 8 registers
842 //
843 // for forwards copy s was offset by -16 from the original input
844 // value of s so the register contents are at these offsets
845 // relative to the 64 bit block addressed by that original input
846 // and so on for each successive 64 byte block when s is updated
847 //
848 // t0 at offset 0, t1 at offset 8
849 // t2 at offset 16, t3 at offset 24
850 // t4 at offset 32, t5 at offset 40
851 // t6 at offset 48, t7 at offset 56
852
853 // for backwards copy s was not offset so the register contents
854 // are at these offsets into the preceding 64 byte block
855 // relative to that original input and so on for each successive
856 // preceding 64 byte block when s is updated. this explains the
857 // slightly counter-intuitive looking pattern of register usage
858 // in the stp instructions for backwards copy.
859 //
860 // t0 at offset -16, t1 at offset -8
861 // t2 at offset -32, t3 at offset -24
862 // t4 at offset -48, t5 at offset -40
863 // t6 at offset -64, t7 at offset -56
864
865 __ ldp(t0, t1, Address(s, 2 * unit));
866 __ ldp(t2, t3, Address(s, 4 * unit));
867 __ ldp(t4, t5, Address(s, 6 * unit));
868 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
869
870 __ subs(count, count, 16);
871 __ br(Assembler::LO, drain);
872
873 int prefetch = PrefetchCopyIntervalInBytes;
874 bool use_stride = false;
875 if (direction == copy_backwards) {
876 use_stride = prefetch > 256;
877 prefetch = -prefetch;
878 if (use_stride) __ mov(stride, prefetch);
879 }
880
881 __ bind(again);
882
883 if (PrefetchCopyIntervalInBytes > 0)
884 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
885
886 if (direction == copy_forwards) {
887 // allowing for the offset of -8 the store instructions place
888 // registers into the target 64 bit block at the following
889 // offsets
890 //
891 // t0 at offset 0
892 // t1 at offset 8, t2 at offset 16
893 // t3 at offset 24, t4 at offset 32
894 // t5 at offset 40, t6 at offset 48
895 // t7 at offset 56
896
897 __ str(t0, Address(d, 1 * unit));
898 __ stp(t1, t2, Address(d, 2 * unit));
899 __ ldp(t0, t1, Address(s, 2 * unit));
900 __ stp(t3, t4, Address(d, 4 * unit));
901 __ ldp(t2, t3, Address(s, 4 * unit));
902 __ stp(t5, t6, Address(d, 6 * unit));
903 __ ldp(t4, t5, Address(s, 6 * unit));
904 __ str(t7, Address(__ pre(d, 8 * unit)));
905 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
906 } else {
907 // d was not offset when we started so the registers are
908 // written into the 64 bit block preceding d with the following
909 // offsets
910 //
911 // t1 at offset -8
912 // t3 at offset -24, t0 at offset -16
913 // t5 at offset -40, t2 at offset -32
914 // t7 at offset -56, t4 at offset -48
915 // t6 at offset -64
916 //
917 // note that this matches the offsets previously noted for the
918 // loads
919
920 __ str(t1, Address(d, 1 * unit));
921 __ stp(t3, t0, Address(d, 3 * unit));
922 __ ldp(t0, t1, Address(s, 2 * unit));
923 __ stp(t5, t2, Address(d, 5 * unit));
924 __ ldp(t2, t3, Address(s, 4 * unit));
925 __ stp(t7, t4, Address(d, 7 * unit));
926 __ ldp(t4, t5, Address(s, 6 * unit));
927 __ str(t6, Address(__ pre(d, 8 * unit)));
928 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
929 }
930
931 __ subs(count, count, 8);
932 __ br(Assembler::HS, again);
933
934 // Drain
935 //
936 // this uses the same pattern of offsets and register arguments
937 // as above
938 __ bind(drain);
939 if (direction == copy_forwards) {
940 __ str(t0, Address(d, 1 * unit));
941 __ stp(t1, t2, Address(d, 2 * unit));
942 __ stp(t3, t4, Address(d, 4 * unit));
943 __ stp(t5, t6, Address(d, 6 * unit));
944 __ str(t7, Address(__ pre(d, 8 * unit)));
945 } else {
946 __ str(t1, Address(d, 1 * unit));
947 __ stp(t3, t0, Address(d, 3 * unit));
948 __ stp(t5, t2, Address(d, 5 * unit));
949 __ stp(t7, t4, Address(d, 7 * unit));
950 __ str(t6, Address(__ pre(d, 8 * unit)));
951 }
952 // now we need to copy any remaining part block which may
953 // include a 4 word block subblock and/or a 2 word subblock.
954 // bits 2 and 1 in the count are the tell-tale for whether we
955 // have each such subblock
956 {
957 Label L1, L2;
958 __ tbz(count, exact_log2(4), L1);
959 // this is the same as above but copying only 4 longs hence
960 // with only one intervening stp between the str instructions
961 // but note that the offsets and registers still follow the
962 // same pattern
963 __ ldp(t0, t1, Address(s, 2 * unit));
964 __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
965 if (direction == copy_forwards) {
966 __ str(t0, Address(d, 1 * unit));
967 __ stp(t1, t2, Address(d, 2 * unit));
968 __ str(t3, Address(__ pre(d, 4 * unit)));
969 } else {
970 __ str(t1, Address(d, 1 * unit));
971 __ stp(t3, t0, Address(d, 3 * unit));
972 __ str(t2, Address(__ pre(d, 4 * unit)));
973 }
974 __ bind(L1);
975
976 __ tbz(count, 1, L2);
977 // this is the same as above but copying only 2 longs hence
978 // there is no intervening stp between the str instructions
979 // but note that the offset and register patterns are still
980 // the same
981 __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
982 if (direction == copy_forwards) {
983 __ str(t0, Address(d, 1 * unit));
984 __ str(t1, Address(__ pre(d, 2 * unit)));
985 } else {
986 __ str(t1, Address(d, 1 * unit));
987 __ str(t0, Address(__ pre(d, 2 * unit)));
988 }
989 __ bind(L2);
990
991 // for forwards copy we need to re-adjust the offsets we
992 // applied so that s and d follow the last words written
993
994 if (direction == copy_forwards) {
995 __ add(s, s, 16);
996 __ add(d, d, 8);
997 }
998
999 }
1000
1001 __ ret(lr);
1002 }
1003 }
1004
1005 // Small copy: less than 16 bytes.
1006 //
1007 // NB: Ignores all of the bits of count which represent more than 15
1008 // bytes, so a caller doesn't have to mask them.
1009
1010 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1011 bool is_backwards = step < 0;
1012 size_t granularity = uabs(step);
1013 int direction = is_backwards ? -1 : 1;
1014 int unit = wordSize * direction;
1015
1016 Label Lpair, Lword, Lint, Lshort, Lbyte;
1017
1018 assert(granularity
1019 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1020
1021 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1022
1023 // ??? I don't know if this bit-test-and-branch is the right thing
1024 // to do. It does a lot of jumping, resulting in several
1025 // mispredicted branches. It might make more sense to do this
1026 // with something like Duff's device with a single computed branch.
1027
1028 __ tbz(count, 3 - exact_log2(granularity), Lword);
1029 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1030 __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1031 __ bind(Lword);
1032
1033 if (granularity <= sizeof (jint)) {
1034 __ tbz(count, 2 - exact_log2(granularity), Lint);
1035 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1036 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1037 __ bind(Lint);
1038 }
1039
1040 if (granularity <= sizeof (jshort)) {
1041 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1042 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1043 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1044 __ bind(Lshort);
1045 }
1046
1047 if (granularity <= sizeof (jbyte)) {
1048 __ tbz(count, 0, Lbyte);
1049 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1050 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1051 __ bind(Lbyte);
1052 }
1053 }
1054
1055 Label copy_f, copy_b;
1056
1057 // All-singing all-dancing memory copy.
1058 //
1059 // Copy count units of memory from s to d. The size of a unit is
1060 // step, which can be positive or negative depending on the direction
1061 // of copy. If is_aligned is false, we align the source address.
1062 //
1063
1064 void copy_memory(bool is_aligned, Register s, Register d,
1065 Register count, Register tmp, int step) {
1066 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1067 bool is_backwards = step < 0;
1068 unsigned int granularity = uabs(step);
1069 const Register t0 = r3, t1 = r4;
1070
1071 // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
1072 // load all the data before writing anything
1073 Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1074 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1075 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1076 const Register send = r17, dend = r18;
1077
1078 if (PrefetchCopyIntervalInBytes > 0)
1079 __ prfm(Address(s, 0), PLDL1KEEP);
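// count is in elements, so the byte thresholds below are divided by the element size (granularity).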
1080 __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1081 __ br(Assembler::HI, copy_big);
1082
1083 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1084 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1085
1086 __ cmp(count, 16/granularity);
1087 __ br(Assembler::LS, copy16);
1088
1089 __ cmp(count, 64/granularity);
1090 __ br(Assembler::HI, copy80);
1091
1092 __ cmp(count, 32/granularity);
1093 __ br(Assembler::LS, copy32);
1094
1095 // 33..64 bytes
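// Copy the first 32 bytes and the last 32 bytes; for lengths of 33..64 bytes the
// two ranges overlap and together cover everything.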
1096 if (UseSIMDForMemoryOps) {
1097 __ ldpq(v0, v1, Address(s, 0));
1098 __ ldpq(v2, v3, Address(send, -32));
1099 __ stpq(v0, v1, Address(d, 0));
1100 __ stpq(v2, v3, Address(dend, -32));
1101 } else {
1102 __ ldp(t0, t1, Address(s, 0));
1103 __ ldp(t2, t3, Address(s, 16));
1104 __ ldp(t4, t5, Address(send, -32));
1105 __ ldp(t6, t7, Address(send, -16));
1106
1107 __ stp(t0, t1, Address(d, 0));
1108 __ stp(t2, t3, Address(d, 16));
1109 __ stp(t4, t5, Address(dend, -32));
1110 __ stp(t6, t7, Address(dend, -16));
1111 }
1112 __ b(finish);
1113
1114 // 17..32 bytes
1115 __ bind(copy32);
1116 __ ldp(t0, t1, Address(s, 0));
1117 __ ldp(t2, t3, Address(send, -16));
1118 __ stp(t0, t1, Address(d, 0));
1119 __ stp(t2, t3, Address(dend, -16));
1120 __ b(finish);
1121
1122 // 65..80/96 bytes
1123 // (96 bytes if SIMD because we do 32 bytes per instruction)
1124 __ bind(copy80);
1125 if (UseSIMDForMemoryOps) {
1126 __ ldpq(v0, v1, Address(s, 0));
1127 __ ldpq(v2, v3, Address(s, 32));
1128 // Unaligned pointers can be an issue for copying.
1129 // The issue is more likely to occur when the granularity of the data is
1130 // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1131 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1132 // The most performance drop has been seen for the range 65-80 bytes.
1133 // For such cases using the pair of ldp/stp instead of the third pair of
1134 // ldpq/stpq fixes the performance issue.
1135 if (granularity < sizeof (jint)) {
1136 Label copy96;
1137 __ cmp(count, u1(80/granularity));
1138 __ br(Assembler::HI, copy96);
1139 __ ldp(t0, t1, Address(send, -16));
1140
1141 __ stpq(v0, v1, Address(d, 0));
1142 __ stpq(v2, v3, Address(d, 32));
1143 __ stp(t0, t1, Address(dend, -16));
1144 __ b(finish);
1145
1146 __ bind(copy96);
1147 }
1148 __ ldpq(v4, v5, Address(send, -32));
1149
1150 __ stpq(v0, v1, Address(d, 0));
1151 __ stpq(v2, v3, Address(d, 32));
1152 __ stpq(v4, v5, Address(dend, -32));
1153 } else {
1154 __ ldp(t0, t1, Address(s, 0));
1155 __ ldp(t2, t3, Address(s, 16));
1156 __ ldp(t4, t5, Address(s, 32));
1157 __ ldp(t6, t7, Address(s, 48));
1158 __ ldp(t8, t9, Address(send, -16));
1159
1160 __ stp(t0, t1, Address(d, 0));
1161 __ stp(t2, t3, Address(d, 16));
1162 __ stp(t4, t5, Address(d, 32));
1163 __ stp(t6, t7, Address(d, 48));
1164 __ stp(t8, t9, Address(dend, -16));
1165 }
1166 __ b(finish);
1167
1168 // 0..16 bytes
1169 __ bind(copy16);
1170 __ cmp(count, 8/granularity);
1171 __ br(Assembler::LO, copy8);
1172
1173 // 8..16 bytes
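// Load the first and last word; the two ranges overlap for lengths of 8..16 bytes,
// so the two stores cover everything.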
1174 __ ldr(t0, Address(s, 0));
1175 __ ldr(t1, Address(send, -8));
1176 __ str(t0, Address(d, 0));
1177 __ str(t1, Address(dend, -8));
1178 __ b(finish);
1179
1180 if (granularity < 8) {
1181 // 4..7 bytes
1182 __ bind(copy8);
1183 __ tbz(count, 2 - exact_log2(granularity), copy4);
1184 __ ldrw(t0, Address(s, 0));
1185 __ ldrw(t1, Address(send, -4));
1186 __ strw(t0, Address(d, 0));
1187 __ strw(t1, Address(dend, -4));
1188 __ b(finish);
1189 if (granularity < 4) {
1190 // 0..3 bytes
1191 __ bind(copy4);
1192 __ cbz(count, finish); // get rid of 0 case
1193 if (granularity == 2) {
1194 __ ldrh(t0, Address(s, 0));
1195 __ strh(t0, Address(d, 0));
1196 } else { // granularity == 1
1197 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1198 // the first and last byte.
1199 // Handle the 3 byte case by loading and storing base + count/2
1200 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1201 // This does mean in the 1 byte case we load/store the same
1202 // byte 3 times.
1203 __ lsr(count, count, 1);
1204 __ ldrb(t0, Address(s, 0));
1205 __ ldrb(t1, Address(send, -1));
1206 __ ldrb(t2, Address(s, count));
1207 __ strb(t0, Address(d, 0));
1208 __ strb(t1, Address(dend, -1));
1209 __ strb(t2, Address(d, count));
1210 }
1211 __ b(finish);
1212 }
1213 }
1214
1215 __ bind(copy_big);
1216 if (is_backwards) {
1217 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1218 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1219 }
1220
1221 // Now we've got the small case out of the way, we can align the
1222 // source address on a 2-word boundary.
1223
1224 Label aligned;
1225
1226 if (is_aligned) {
1227 // We may have to adjust by 1 word to get s 2-word-aligned.
1228 __ tbz(s, exact_log2(wordSize), aligned);
1229 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1230 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1231 __ sub(count, count, wordSize/granularity);
1232 } else {
1233 if (is_backwards) {
1234 __ andr(rscratch2, s, 2 * wordSize - 1);
1235 } else {
1236 __ neg(rscratch2, s);
1237 __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1238 }
1239 // rscratch2 is the byte adjustment needed to align s.
1240 __ cbz(rscratch2, aligned);
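// Convert the byte adjustment into a count of elements before subtracting it from count.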
1241 int shift = exact_log2(granularity);
1242 if (shift) __ lsr(rscratch2, rscratch2, shift);
1243 __ sub(count, count, rscratch2);
1244
1245 #if 0
1246 // ?? This code is only correct for a disjoint copy. It may or
1247 // may not make sense to use it in that case.
1248
1249 // Copy the first pair; s and d may not be aligned.
1250 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1251 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1252
1253 // Align s and d, adjust count
1254 if (is_backwards) {
1255 __ sub(s, s, rscratch2);
1256 __ sub(d, d, rscratch2);
1257 } else {
1258 __ add(s, s, rscratch2);
1259 __ add(d, d, rscratch2);
1260 }
1261 #else
1262 copy_memory_small(s, d, rscratch2, rscratch1, step);
1263 #endif
1264 }
1265
1266 __ bind(aligned);
1267
1268 // s is now 2-word-aligned.
1269
1270 // We have a count of units and some trailing bytes. Adjust the
1271 // count and do a bulk copy of words.
1272 __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1273 if (direction == copy_forwards)
1274 __ bl(copy_f);
1275 else
1276 __ bl(copy_b);
1277
1278 // And the tail.
1279 copy_memory_small(s, d, count, tmp, step);
1280
1281 if (granularity >= 8) __ bind(copy8);
1282 if (granularity >= 4) __ bind(copy4);
1283 __ bind(finish);
1284 }
1285
1286
1287 void clobber_registers() {
1288 #ifdef ASSERT
1289 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1290 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1291 for (Register r = r3; r <= r18; r++)
1292 if (r != rscratch1) __ mov(r, rscratch1);
1293 #endif
1294 }
1295
1296 // Scan over array at a for count oops, verifying each one.
1297 // Preserves a and count, clobbers rscratch1 and rscratch2.
1298 void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1299 Label loop, end;
1300 __ mov(rscratch1, a);
1301 __ mov(rscratch2, zr);
1302 __ bind(loop);
1303 __ cmp(rscratch2, count);
1304 __ br(Assembler::HS, end);
1305 if (size == (size_t)wordSize) {
1306 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1307 __ verify_oop(temp);
1308 } else {
1309 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310 __ decode_heap_oop(temp); // calls verify_oop
1311 }
1312 __ add(rscratch2, rscratch2, 1);
1313 __ b(loop);
1314 __ bind(end);
1315 }
1316
1317 // Arguments:
1318 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1319 // ignored
1320 // is_oop - true => oop array, so generate store check code
1321 // name - stub name string
1322 //
1323 // Inputs:
1324 // c_rarg0 - source array address
1325 // c_rarg1 - destination array address
1326 // c_rarg2 - element count, treated as ssize_t, can be zero
1327 //
1328 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1329 // the hardware handle it. The two dwords within qwords that span
1330 // cache line boundaries will still be loaded and stored atomically.
1331 //
1332 // Side Effects:
1333 // disjoint_int_copy_entry is set to the no-overlap entry point
1334 // used by generate_conjoint_int_oop_copy().
1335 //
1336 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1337 const char *name, bool dest_uninitialized = false) {
1338 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1339 RegSet saved_reg = RegSet::of(s, d, count);
1340 __ align(CodeEntryAlignment);
1341 StubCodeMark mark(this, "StubRoutines", name);
1342 address start = __ pc();
1343 __ enter();
1344
1345 if (entry != NULL) {
1346 *entry = __ pc();
1347 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1348 BLOCK_COMMENT("Entry:");
1349 }
1350
1351 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1352 if (dest_uninitialized) {
1353 decorators |= IS_DEST_UNINITIALIZED;
1354 }
1355 if (aligned) {
1356 decorators |= ARRAYCOPY_ALIGNED;
1357 }
1358
1359 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1360 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1361
1362 if (is_oop) {
1363 // save regs before copy_memory
1364 __ push(RegSet::of(d, count), sp);
1365 }
1366 copy_memory(aligned, s, d, count, rscratch1, size);
1367
1368 if (is_oop) {
1369 __ pop(RegSet::of(d, count), sp);
1370 if (VerifyOops)
1371 verify_oop_array(size, d, count, r16);
1372 }
1373
1374 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1375
1376 __ leave();
1377 __ mov(r0, zr); // return 0
1378 __ ret(lr);
1379 return start;
1380 }
1381
1382 // Arguments:
1383 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1384 // ignored
1385 // is_oop - true => oop array, so generate store check code
1386 // name - stub name string
1387 //
1388 // Inputs:
1389 // c_rarg0 - source array address
1390 // c_rarg1 - destination array address
1391 // c_rarg2 - element count, treated as ssize_t, can be zero
1392 //
1393 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1394 // the hardware handle it. The two dwords within qwords that span
1395 // cache line boundaries will still be loaded and stored atomically.
1396 //
1397 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1398 address *entry, const char *name,
1399 bool dest_uninitialized = false) {
1400 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1401 RegSet saved_regs = RegSet::of(s, d, count);
1402 StubCodeMark mark(this, "StubRoutines", name);
1403 address start = __ pc();
1404 __ enter();
1405
1406 if (entry != NULL) {
1407 *entry = __ pc();
1408 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1409 BLOCK_COMMENT("Entry:");
1410 }
1411
1412 // use fwd copy when (d-s) above_equal (count*size)
1413 __ sub(rscratch1, d, s);
1414 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1415 __ br(Assembler::HS, nooverlap_target);
1416
1417 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1418 if (dest_uninitialized) {
1419 decorators |= IS_DEST_UNINITIALIZED;
1420 }
1421 if (aligned) {
1422 decorators |= ARRAYCOPY_ALIGNED;
1423 }
1424
1425 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1426 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1427
1428 if (is_oop) {
1429 // save regs before copy_memory
1430 __ push(RegSet::of(d, count), sp);
1431 }
1432 copy_memory(aligned, s, d, count, rscratch1, -size);
1433 if (is_oop) {
1434 __ pop(RegSet::of(d, count), sp);
1435 if (VerifyOops)
1436 verify_oop_array(size, d, count, r16);
1437 }
1438 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1439 __ leave();
1440 __ mov(r0, zr); // return 0
1441 __ ret(lr);
1442 return start;
1443 }
1444
1445 // Arguments:
1446 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1447 // ignored
1448 // name - stub name string
1449 //
1450 // Inputs:
1451 // c_rarg0 - source array address
1452 // c_rarg1 - destination array address
1453 // c_rarg2 - element count, treated as ssize_t, can be zero
1454 //
1455 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1456 // we let the hardware handle it. The one to eight bytes within words,
1457 // dwords or qwords that span cache line boundaries will still be loaded
1458 // and stored atomically.
1459 //
1467 // Side Effects:
1468 // disjoint_byte_copy_entry is set to the no-overlap entry point
1469 // used by generate_conjoint_byte_copy().
1470 //
1471 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1472 const bool not_oop = false;
1473 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1474 }
1475
1476 // Arguments:
1477 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1478 // ignored
1479 // name - stub name string
1480 //
1481 // Inputs:
1482 // c_rarg0 - source array address
1483 // c_rarg1 - destination array address
1484 // c_rarg2 - element count, treated as ssize_t, can be zero
1485 //
1486 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1487 // we let the hardware handle it. The one to eight bytes within words,
1488 // dwords or qwords that span cache line boundaries will still be loaded
1489 // and stored atomically.
1490 //
1491 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1492 address* entry, const char *name) {
1493 const bool not_oop = false;
1494 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1495 }
1496
1497 // Arguments:
1498 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1499 // ignored
1500 // name - stub name string
1501 //
1502 // Inputs:
1503 // c_rarg0 - source array address
1504 // c_rarg1 - destination array address
1505 // c_rarg2 - element count, treated as ssize_t, can be zero
1506 //
1507 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1508 // let the hardware handle it. The two or four words within dwords
1509 // or qwords that span cache line boundaries will still be loaded
1510 // and stored atomically.
1511 //
1512 // Side Effects:
1513 // disjoint_short_copy_entry is set to the no-overlap entry point
1514 // used by generate_conjoint_short_copy().
1515 //
1516 address generate_disjoint_short_copy(bool aligned,
1517 address* entry, const char *name) {
1518 const bool not_oop = false;
1519 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1520 }
1521
1522 // Arguments:
1523 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1524 // ignored
1525 // name - stub name string
1526 //
1527 // Inputs:
1528 // c_rarg0 - source array address
1529 // c_rarg1 - destination array address
1530 // c_rarg2 - element count, treated as ssize_t, can be zero
1531 //
1532 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1533 // let the hardware handle it. The two or four words within dwords
1534 // or qwords that span cache line boundaries will still be loaded
1535 // and stored atomically.
1536 //
1537 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1538 address *entry, const char *name) {
1539 const bool not_oop = false;
1540 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1541
1542 }
1543 // Arguments:
1544 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1545 // ignored
1546 // name - stub name string
1547 //
1548 // Inputs:
1549 // c_rarg0 - source array address
1550 // c_rarg1 - destination array address
1551 // c_rarg2 - element count, treated as ssize_t, can be zero
1552 //
1553 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554 // the hardware handle it. The two dwords within qwords that span
1555 // cache line boundaries will still be loaded and stored atomically.
1556 //
1557 // Side Effects:
1558 // disjoint_int_copy_entry is set to the no-overlap entry point
1559 // used by generate_conjoint_int_oop_copy().
1560 //
1561 address generate_disjoint_int_copy(bool aligned, address *entry,
1562 const char *name, bool dest_uninitialized = false) {
1563 const bool not_oop = false;
1564 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1565 }
1566
1567 // Arguments:
1568 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569 // ignored
1570 // name - stub name string
1571 //
1572 // Inputs:
1573 // c_rarg0 - source array address
1574 // c_rarg1 - destination array address
1575 // c_rarg2 - element count, treated as ssize_t, can be zero
1576 //
1577 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578 // the hardware handle it. The two dwords within qwords that span
1579 // cache line boundaries will still be loaded and stored atomically.
1580 //
1581 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1582 address *entry, const char *name,
1583 bool dest_uninitialized = false) {
1584 const bool not_oop = false;
1585 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1586 }
1587
1588
1589 // Arguments:
1590 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1591 // ignored
1592 // name - stub name string
1593 //
1594 // Inputs:
1595 // c_rarg0 - source array address
1596 // c_rarg1 - destination array address
1597 // c_rarg2 - element count, treated as size_t, can be zero
1598 //
1599 // Side Effects:
1600 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1601 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1602 //
1603 address generate_disjoint_long_copy(bool aligned, address *entry,
1604 const char *name, bool dest_uninitialized = false) {
1605 const bool not_oop = false;
1606 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1607 }
1608
1609 // Arguments:
1610 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1611 // ignored
1612 // name - stub name string
1613 //
1614 // Inputs:
1615 // c_rarg0 - source array address
1616 // c_rarg1 - destination array address
1617 // c_rarg2 - element count, treated as size_t, can be zero
1618 //
1619 address generate_conjoint_long_copy(bool aligned,
1620 address nooverlap_target, address *entry,
1621 const char *name, bool dest_uninitialized = false) {
1622 const bool not_oop = false;
1623 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1624 }
1625
1626 // Arguments:
1627 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1628 // ignored
1629 // name - stub name string
1630 //
1631 // Inputs:
1632 // c_rarg0 - source array address
1633 // c_rarg1 - destination array address
1634 // c_rarg2 - element count, treated as size_t, can be zero
1635 //
1636 // Side Effects:
1637 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1638 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1639 //
1640   address generate_disjoint_oop_copy(bool aligned, address *entry,
1641 const char *name, bool dest_uninitialized) {
1642 const bool is_oop = true;
1643 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1644 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1645 }
1646
1647 // Arguments:
1648 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649 // ignored
1650 // name - stub name string
1651 //
1652 // Inputs:
1653 // c_rarg0 - source array address
1654 // c_rarg1 - destination array address
1655 // c_rarg2 - element count, treated as size_t, can be zero
1656 //
1657   address generate_conjoint_oop_copy(bool aligned,
1658 address nooverlap_target, address *entry,
1659 const char *name, bool dest_uninitialized) {
1660 const bool is_oop = true;
1661 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1663 name, dest_uninitialized);
1664 }
1665
1666
1667 // Helper for generating a dynamic type check.
1668 // Smashes rscratch1, rscratch2.
1669   void generate_type_check(Register sub_klass,
1670 Register super_check_offset,
1671 Register super_klass,
1672 Label& L_success) {
1673 assert_different_registers(sub_klass, super_check_offset, super_klass);
1674
1675 BLOCK_COMMENT("type_check:");
1676
1677 Label L_miss;
1678
1679 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1680 super_check_offset);
1681 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1682
1683 // Fall through on failure!
1684 __ BIND(L_miss);
1685 }
1686
1687 //
1688 // Generate checkcasting array copy stub
1689 //
1690 // Input:
1691 // c_rarg0 - source array address
1692 // c_rarg1 - destination array address
1693 // c_rarg2 - element count, treated as ssize_t, can be zero
1694 // c_rarg3 - size_t ckoff (super_check_offset)
1695 // c_rarg4 - oop ckval (super_klass)
1696 //
1697 // Output:
1698 // r0 == 0 - success
1699 // r0 == -1^K - failure, where K is partial transfer count
1700 //
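  // (Editor's illustrative note, not part of the stub contract: a caller can
  //  recover the number of elements copied before the failing one by
  //  bitwise-inverting the result, roughly:
  //    if (r0 != 0) { size_t K = ~r0; }   // since r0 == -1 ^ K
  //  The local name K here is only for illustration.)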
1701   address generate_checkcast_copy(const char *name, address *entry,
1702 bool dest_uninitialized = false) {
1703
1704 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1705
1706 // Input registers (after setup_arg_regs)
1707 const Register from = c_rarg0; // source array address
1708 const Register to = c_rarg1; // destination array address
1709     const Register count       = c_rarg2;   // elements count
1710 const Register ckoff = c_rarg3; // super_check_offset
1711 const Register ckval = c_rarg4; // super_klass
1712
1713 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1714 RegSet wb_post_saved_regs = RegSet::of(count);
1715
1716 // Registers used as temps (r18, r19, r20 are save-on-entry)
1717     const Register count_save  = r21;       // orig elements count
1718 const Register start_to = r20; // destination array start address
1719 const Register copied_oop = r18; // actual oop copied
1720 const Register r19_klass = r19; // oop._klass
1721
1722 //---------------------------------------------------------------
1723 // Assembler stub will be used for this call to arraycopy
1724 // if the two arrays are subtypes of Object[] but the
1725 // destination array type is not equal to or a supertype
1726 // of the source type. Each element must be separately
1727 // checked.
1728
1729 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1730 copied_oop, r19_klass, count_save);
1731
1732 __ align(CodeEntryAlignment);
1733 StubCodeMark mark(this, "StubRoutines", name);
1734 address start = __ pc();
1735
1736 __ enter(); // required for proper stackwalking of RuntimeStub frame
1737
1738 #ifdef ASSERT
1739 // caller guarantees that the arrays really are different
1740 // otherwise, we would have to make conjoint checks
1741 { Label L;
1742 array_overlap_test(L, TIMES_OOP);
1743 __ stop("checkcast_copy within a single array");
1744 __ bind(L);
1745 }
1746 #endif //ASSERT
1747
1748 // Caller of this entry point must set up the argument registers.
1749 if (entry != NULL) {
1750 *entry = __ pc();
1751 BLOCK_COMMENT("Entry:");
1752 }
1753
1754 // Empty array: Nothing to do.
1755 __ cbz(count, L_done);
1756
1757 __ push(RegSet::of(r18, r19, r20, r21), sp);
1758
1759 #ifdef ASSERT
1760 BLOCK_COMMENT("assert consistent ckoff/ckval");
1761 // The ckoff and ckval must be mutually consistent,
1762 // even though caller generates both.
1763 { Label L;
1764 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1765 __ ldrw(start_to, Address(ckval, sco_offset));
1766 __ cmpw(ckoff, start_to);
1767 __ br(Assembler::EQ, L);
1768 __ stop("super_check_offset inconsistent");
1769 __ bind(L);
1770 }
1771 #endif //ASSERT
1772
1773 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1774 bool is_oop = true;
1775 if (dest_uninitialized) {
1776 decorators |= IS_DEST_UNINITIALIZED;
1777 }
1778
1779 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1780 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1781
1782 // save the original count
1783 __ mov(count_save, count);
1784
1785 // Copy from low to high addresses
1786 __ mov(start_to, to); // Save destination array start address
1787 __ b(L_load_element);
1788
1789 // ======== begin loop ========
1790 // (Loop is rotated; its entry is L_load_element.)
1791 // Loop control:
1792 // for (; count != 0; count--) {
1793 // copied_oop = load_heap_oop(from++);
1794 // ... generate_type_check ...;
1795 // store_heap_oop(to++, copied_oop);
1796 // }
1797 __ align(OptoLoopAlignment);
1798
1799 __ BIND(L_store_element);
1800 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop
1801 __ sub(count, count, 1);
1802 __ cbz(count, L_do_card_marks);
1803
1804 // ======== loop entry is here ========
1805 __ BIND(L_load_element);
1806 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1807 __ cbz(copied_oop, L_store_element);
1808
1809 __ load_klass(r19_klass, copied_oop);// query the object klass
1810 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1811 // ======== end loop ========
1812
1813 // It was a real error; we must depend on the caller to finish the job.
1814 // Register count = remaining oops, count_orig = total oops.
1815 // Emit GC store barriers for the oops we have copied and report
1816 // their number to the caller.
1817
1818 __ subs(count, count_save, count); // K = partially copied oop count
1819 __ eon(count, count, zr); // report (-1^K) to caller
1820 __ br(Assembler::EQ, L_done_pop);
1821
1822 __ BIND(L_do_card_marks);
1823 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1824
1825 __ bind(L_done_pop);
1826 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1827 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1828
1829 __ bind(L_done);
1830 __ mov(r0, count);
1831 __ leave();
1832 __ ret(lr);
1833
1834 return start;
1835 }
1836
1837 // Perform range checks on the proposed arraycopy.
1838 // Kills temp, but nothing else.
1839 // Also, clean the sign bits of src_pos and dst_pos.
1840   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1841 Register src_pos, // source position (c_rarg1)
1842                               Register dst,     // destination array oop (c_rarg2)
1843 Register dst_pos, // destination position (c_rarg3)
1844 Register length,
1845 Register temp,
1846 Label& L_failed) {
1847 BLOCK_COMMENT("arraycopy_range_checks:");
1848
1849 assert_different_registers(rscratch1, temp);
1850
1851 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1852 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1853 __ addw(temp, length, src_pos);
1854 __ cmpw(temp, rscratch1);
1855 __ br(Assembler::HI, L_failed);
1856
1857 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1858 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1859 __ addw(temp, length, dst_pos);
1860 __ cmpw(temp, rscratch1);
1861 __ br(Assembler::HI, L_failed);
1862
1863 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1864 __ movw(src_pos, src_pos);
1865 __ movw(dst_pos, dst_pos);
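    // (Note: on AArch64 a write to a 32-bit w-register, such as the movw
    //  above, implicitly zeroes the upper 32 bits of the underlying
    //  x-register, which is what discards the stale sign bits here.)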
1866
1867 BLOCK_COMMENT("arraycopy_range_checks done");
1868 }
1869
1870 // These stubs get called from some dumb test routine.
1871 // I'll write them properly when they're called from
1872 // something that's actually doing something.
1873   static void fake_arraycopy_stub(address src, address dst, int count) {
1874 assert(count == 0, "huh?");
1875 }
1876
1877
1878 //
1879 // Generate 'unsafe' array copy stub
1880 // Though just as safe as the other stubs, it takes an unscaled
1881 // size_t argument instead of an element count.
1882 //
1883 // Input:
1884 // c_rarg0 - source array address
1885 // c_rarg1 - destination array address
1886 // c_rarg2 - byte count, treated as ssize_t, can be zero
1887 //
1888 // Examines the alignment of the operands and dispatches
1889 // to a long, int, short, or byte copy loop.
1890 //
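  // Roughly equivalent C logic for the dispatch below (illustrative sketch
  // only; copy_longs/copy_ints/copy_shorts/copy_bytes are hypothetical names
  // standing in for the entry points passed to this generator):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & 7) == 0) copy_longs (s, d, count >> 3);
  //   else if ((bits & 3) == 0) copy_ints  (s, d, count >> 2);
  //   else if ((bits & 1) == 0) copy_shorts(s, d, count >> 1);
  //   else                      copy_bytes (s, d, count);
  //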
1891   address generate_unsafe_copy(const char *name,
1892 address byte_copy_entry,
1893 address short_copy_entry,
1894 address int_copy_entry,
1895 address long_copy_entry) {
1896 Label L_long_aligned, L_int_aligned, L_short_aligned;
1897 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1898
1899 __ align(CodeEntryAlignment);
1900 StubCodeMark mark(this, "StubRoutines", name);
1901 address start = __ pc();
1902 __ enter(); // required for proper stackwalking of RuntimeStub frame
1903
1904 // bump this on entry, not on exit:
1905 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1906
1907 __ orr(rscratch1, s, d);
1908 __ orr(rscratch1, rscratch1, count);
1909
1910 __ andr(rscratch1, rscratch1, BytesPerLong-1);
1911 __ cbz(rscratch1, L_long_aligned);
1912 __ andr(rscratch1, rscratch1, BytesPerInt-1);
1913 __ cbz(rscratch1, L_int_aligned);
1914 __ tbz(rscratch1, 0, L_short_aligned);
1915 __ b(RuntimeAddress(byte_copy_entry));
1916
1917 __ BIND(L_short_aligned);
1918 __ lsr(count, count, LogBytesPerShort); // size => short_count
1919 __ b(RuntimeAddress(short_copy_entry));
1920 __ BIND(L_int_aligned);
1921 __ lsr(count, count, LogBytesPerInt); // size => int_count
1922 __ b(RuntimeAddress(int_copy_entry));
1923 __ BIND(L_long_aligned);
1924 __ lsr(count, count, LogBytesPerLong); // size => long_count
1925 __ b(RuntimeAddress(long_copy_entry));
1926
1927 return start;
1928 }
1929
1930 //
1931 // Generate generic array copy stubs
1932 //
1933 // Input:
1934 // c_rarg0 - src oop
1935 // c_rarg1 - src_pos (32-bits)
1936 // c_rarg2 - dst oop
1937 // c_rarg3 - dst_pos (32-bits)
1938 // c_rarg4 - element count (32-bits)
1939 //
1940 // Output:
1941 // r0 == 0 - success
1942 // r0 == -1^K - failure, where K is partial transfer count
1943 //
1944   address generate_generic_copy(const char *name,
1945 address byte_copy_entry, address short_copy_entry,
1946 address int_copy_entry, address oop_copy_entry,
1947 address long_copy_entry, address checkcast_copy_entry) {
1948
1949 Label L_failed, L_failed_0, L_objArray;
1950 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1951
1952 // Input registers
1953 const Register src = c_rarg0; // source array oop
1954 const Register src_pos = c_rarg1; // source position
1955 const Register dst = c_rarg2; // destination array oop
1956 const Register dst_pos = c_rarg3; // destination position
1957 const Register length = c_rarg4;
1958
1959 __ align(CodeEntryAlignment);
1960
1961 StubCodeMark mark(this, "StubRoutines", name);
1962
1963 // Registers used as temps
1964 const Register dst_klass = c_rarg5;
1965
1966 address start = __ pc();
1967
1968 __ enter(); // required for proper stackwalking of RuntimeStub frame
1969
1970 // bump this on entry, not on exit:
1971 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1972
1973 //-----------------------------------------------------------------------
1974 // Assembler stub will be used for this call to arraycopy
1975 // if the following conditions are met:
1976 //
1977 // (1) src and dst must not be null.
1978 // (2) src_pos must not be negative.
1979 // (3) dst_pos must not be negative.
1980 // (4) length must not be negative.
1981 // (5) src klass and dst klass should be the same and not NULL.
1982 // (6) src and dst should be arrays.
1983 // (7) src_pos + length must not exceed length of src.
1984 // (8) dst_pos + length must not exceed length of dst.
1985 //
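    // (Explanatory note: if any of these checks fails, the stub simply
    //  returns -1 in r0 and the caller falls back to the slower runtime
    //  arraycopy path, which re-checks the arguments.)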
1986
1987 // if (src == NULL) return -1;
1988 __ cbz(src, L_failed);
1989
1990 // if (src_pos < 0) return -1;
1991 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
1992
1993 // if (dst == NULL) return -1;
1994 __ cbz(dst, L_failed);
1995
1996 // if (dst_pos < 0) return -1;
1997 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
1998
1999 // registers used as temp
2000 const Register scratch_length = r16; // elements count to copy
2001 const Register scratch_src_klass = r17; // array klass
2002 const Register lh = r18; // layout helper
2003
2004 // if (length < 0) return -1;
2005 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2006 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2007
2008 __ load_klass(scratch_src_klass, src);
2009 #ifdef ASSERT
2010 // assert(src->klass() != NULL);
2011 {
2012 BLOCK_COMMENT("assert klasses not null {");
2013 Label L1, L2;
2014 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL
2015 __ bind(L1);
2016 __ stop("broken null klass");
2017 __ bind(L2);
2018 __ load_klass(rscratch1, dst);
2019 __ cbz(rscratch1, L1); // this would be broken also
2020 BLOCK_COMMENT("} assert klasses not null done");
2021 }
2022 #endif
2023
2024 // Load layout helper (32-bits)
2025 //
2026     //   |array_tag|     | header_size | element_type |     |log2_element_size|
2027     // 32        30    24            16              8     2                 0
2028 //
2029 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2030 //
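    // Illustrative decode of the fields above (a sketch using the Klass
    // constants referenced later in this function, not code from the stub):
    //   tag        = ((juint)lh) >> Klass::_lh_array_tag_shift;
    //   hsize      = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   log2_esize =  lh & Klass::_lh_log2_element_size_mask;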
2031
2032 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2033
2034 // Handle objArrays completely differently...
2035 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2036 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2037 __ movw(rscratch1, objArray_lh);
2038 __ eorw(rscratch2, lh, rscratch1);
2039 __ cbzw(rscratch2, L_objArray);
2040
2041 // if (src->klass() != dst->klass()) return -1;
2042 __ load_klass(rscratch2, dst);
2043 __ eor(rscratch2, rscratch2, scratch_src_klass);
2044 __ cbnz(rscratch2, L_failed);
2045
2046 // if (!src->is_Array()) return -1;
2047 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2048
2049 // At this point, it is known to be a typeArray (array_tag 0x3).
2050 #ifdef ASSERT
2051 {
2052 BLOCK_COMMENT("assert primitive array {");
2053 Label L;
2054 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2055 __ cmpw(lh, rscratch2);
2056 __ br(Assembler::GE, L);
2057 __ stop("must be a primitive array");
2058 __ bind(L);
2059 BLOCK_COMMENT("} assert primitive array done");
2060 }
2061 #endif
2062
2063 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2064 rscratch2, L_failed);
2065
2066 // TypeArrayKlass
2067 //
2068 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2069 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2070 //
2071
2072 const Register rscratch1_offset = rscratch1; // array offset
2073 const Register r18_elsize = lh; // element size
2074
2075 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2076 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2077 __ add(src, src, rscratch1_offset); // src array offset
2078 __ add(dst, dst, rscratch1_offset); // dst array offset
2079 BLOCK_COMMENT("choose copy loop based on element size");
2080
2081 // next registers should be set before the jump to corresponding stub
2082 const Register from = c_rarg0; // source array address
2083 const Register to = c_rarg1; // destination array address
2084 const Register count = c_rarg2; // elements count
2085
2086 // 'from', 'to', 'count' registers should be set in such order
2087 // since they are the same as 'src', 'src_pos', 'dst'.
2088
2089 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2090
2091 // The possible values of elsize are 0-3, i.e. exact_log2(element
2092 // size in bytes). We do a simple bitwise binary search.
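    // Illustrative mapping of the two tested bits (elsize = log2 of the
    // element size in bytes):
    //   00 -> byte copy, 01 -> short copy, 10 -> int copy, 11 -> long copy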
2093 __ BIND(L_copy_bytes);
2094 __ tbnz(r18_elsize, 1, L_copy_ints);
2095 __ tbnz(r18_elsize, 0, L_copy_shorts);
2096 __ lea(from, Address(src, src_pos));// src_addr
2097 __ lea(to, Address(dst, dst_pos));// dst_addr
2098 __ movw(count, scratch_length); // length
2099 __ b(RuntimeAddress(byte_copy_entry));
2100
2101 __ BIND(L_copy_shorts);
2102 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2103 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2104 __ movw(count, scratch_length); // length
2105 __ b(RuntimeAddress(short_copy_entry));
2106
2107 __ BIND(L_copy_ints);
2108 __ tbnz(r18_elsize, 0, L_copy_longs);
2109 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2110 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2111 __ movw(count, scratch_length); // length
2112 __ b(RuntimeAddress(int_copy_entry));
2113
2114 __ BIND(L_copy_longs);
2115 #ifdef ASSERT
2116 {
2117 BLOCK_COMMENT("assert long copy {");
2118 Label L;
2119 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2120 __ cmpw(r18_elsize, LogBytesPerLong);
2121 __ br(Assembler::EQ, L);
2122 __ stop("must be long copy, but elsize is wrong");
2123 __ bind(L);
2124 BLOCK_COMMENT("} assert long copy done");
2125 }
2126 #endif
2127 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2128 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2129 __ movw(count, scratch_length); // length
2130 __ b(RuntimeAddress(long_copy_entry));
2131
2132 // ObjArrayKlass
2133 __ BIND(L_objArray);
2134 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2135
2136 Label L_plain_copy, L_checkcast_copy;
2137 // test array classes for subtyping
2138 __ load_klass(r18, dst);
2139 __ cmp(scratch_src_klass, r18); // usual case is exact equality
2140 __ br(Assembler::NE, L_checkcast_copy);
2141
2142 // Identically typed arrays can be copied without element-wise checks.
2143 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2144 rscratch2, L_failed);
2145
2146 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2147 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2148 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2149 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2150 __ movw(count, scratch_length); // length
2151 __ BIND(L_plain_copy);
2152 __ b(RuntimeAddress(oop_copy_entry));
2153
2154 __ BIND(L_checkcast_copy);
2155 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass)
2156 {
2157 // Before looking at dst.length, make sure dst is also an objArray.
2158 __ ldrw(rscratch1, Address(r18, lh_offset));
2159 __ movw(rscratch2, objArray_lh);
2160 __ eorw(rscratch1, rscratch1, rscratch2);
2161 __ cbnzw(rscratch1, L_failed);
2162
2163 // It is safe to examine both src.length and dst.length.
2164 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2165 r18, L_failed);
2166
2167 __ load_klass(dst_klass, dst); // reload
2168
2169 // Marshal the base address arguments now, freeing registers.
2170 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2171 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2172 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2173 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174 __ movw(count, length); // length (reloaded)
2175 Register sco_temp = c_rarg3; // this register is free now
2176 assert_different_registers(from, to, count, sco_temp,
2177 dst_klass, scratch_src_klass);
2178 // assert_clean_int(count, sco_temp);
2179
2180 // Generate the type check.
2181 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2182 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2183
2184 // Smashes rscratch1, rscratch2
2185 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2186
2187 // Fetch destination element klass from the ObjArrayKlass header.
2188 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2189 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2190 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2191
2192 // the checkcast_copy loop needs two extra arguments:
2193 assert(c_rarg3 == sco_temp, "#3 already in place");
2194 // Set up arguments for checkcast_copy_entry.
2195 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2196 __ b(RuntimeAddress(checkcast_copy_entry));
2197 }
2198
2199 __ BIND(L_failed);
2200 __ mov(r0, -1);
2201 __ leave(); // required for proper stackwalking of RuntimeStub frame
2202 __ ret(lr);
2203
2204 return start;
2205 }
2206
2207 //
2208 // Generate stub for array fill. If "aligned" is true, the
2209 // "to" address is assumed to be heapword aligned.
2210 //
2211 // Arguments for generated stub:
2212 // to: c_rarg0
2213 // value: c_rarg1
2214 // count: c_rarg2 treated as signed
2215 //
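  // Strategy (descriptive summary of the code below): the fill value is first
  // replicated across 64 bits (e.g. a byte 0xAB becomes 0xABABABABABABABAB
  // via the bfi sequences), the destination is aligned to 8 bytes, the bulk
  // is filled a word at a time (or block-zeroed when the value is zero and
  // UseBlockZeroing is enabled), and short heads/tails are handled by
  // element-sized stores.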
2216   address generate_fill(BasicType t, bool aligned, const char *name) {
2217 __ align(CodeEntryAlignment);
2218 StubCodeMark mark(this, "StubRoutines", name);
2219 address start = __ pc();
2220
2221 BLOCK_COMMENT("Entry:");
2222
2223 const Register to = c_rarg0; // source array address
2224 const Register value = c_rarg1; // value
2225 const Register count = c_rarg2; // elements count
2226
2227 const Register bz_base = r10; // base for block_zero routine
2228 const Register cnt_words = r11; // temp register
2229
2230 __ enter();
2231
2232 Label L_fill_elements, L_exit1;
2233
2234 int shift = -1;
2235 switch (t) {
2236 case T_BYTE:
2237 shift = 0;
2238 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2239 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2240 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2241 __ br(Assembler::LO, L_fill_elements);
2242 break;
2243 case T_SHORT:
2244 shift = 1;
2245 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2246 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2247 __ br(Assembler::LO, L_fill_elements);
2248 break;
2249 case T_INT:
2250 shift = 2;
2251 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2252 __ br(Assembler::LO, L_fill_elements);
2253 break;
2254 default: ShouldNotReachHere();
2255 }
2256
2257 // Align source address at 8 bytes address boundary.
2258 Label L_skip_align1, L_skip_align2, L_skip_align4;
2259 if (!aligned) {
2260 switch (t) {
2261 case T_BYTE:
2262 // One byte misalignment happens only for byte arrays.
2263 __ tbz(to, 0, L_skip_align1);
2264 __ strb(value, Address(__ post(to, 1)));
2265 __ subw(count, count, 1);
2266 __ bind(L_skip_align1);
2267 // Fallthrough
2268 case T_SHORT:
2269 // Two bytes misalignment happens only for byte and short (char) arrays.
2270 __ tbz(to, 1, L_skip_align2);
2271 __ strh(value, Address(__ post(to, 2)));
2272 __ subw(count, count, 2 >> shift);
2273 __ bind(L_skip_align2);
2274 // Fallthrough
2275 case T_INT:
2276 // Align to 8 bytes, we know we are 4 byte aligned to start.
2277 __ tbz(to, 2, L_skip_align4);
2278 __ strw(value, Address(__ post(to, 4)));
2279 __ subw(count, count, 4 >> shift);
2280 __ bind(L_skip_align4);
2281 break;
2282 default: ShouldNotReachHere();
2283 }
2284 }
2285
2286 //
2287 // Fill large chunks
2288 //
2289 __ lsrw(cnt_words, count, 3 - shift); // number of words
2290 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2291 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2292 if (UseBlockZeroing) {
2293 Label non_block_zeroing, rest;
2294 // If the fill value is zero we can use the fast zero_words().
2295 __ cbnz(value, non_block_zeroing);
2296 __ mov(bz_base, to);
2297 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2298 __ zero_words(bz_base, cnt_words);
2299 __ b(rest);
2300 __ bind(non_block_zeroing);
2301 __ fill_words(to, cnt_words, value);
2302 __ bind(rest);
2303 } else {
2304 __ fill_words(to, cnt_words, value);
2305 }
2306
2307 // Remaining count is less than 8 bytes. Fill it by a single store.
2308 // Note that the total length is no less than 8 bytes.
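    // (The str below deliberately overlaps bytes that were already filled:
    //  since 'value' has been replicated across all 64 bits, rewriting them
    //  with the same pattern is harmless.)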
2309 if (t == T_BYTE || t == T_SHORT) {
2310 Label L_exit1;
2311 __ cbzw(count, L_exit1);
2312 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2313 __ str(value, Address(to, -8)); // overwrite some elements
2314 __ bind(L_exit1);
2315 __ leave();
2316 __ ret(lr);
2317 }
2318
2319 // Handle copies less than 8 bytes.
2320 Label L_fill_2, L_fill_4, L_exit2;
2321 __ bind(L_fill_elements);
2322 switch (t) {
2323 case T_BYTE:
2324 __ tbz(count, 0, L_fill_2);
2325 __ strb(value, Address(__ post(to, 1)));
2326 __ bind(L_fill_2);
2327 __ tbz(count, 1, L_fill_4);
2328 __ strh(value, Address(__ post(to, 2)));
2329 __ bind(L_fill_4);
2330 __ tbz(count, 2, L_exit2);
2331 __ strw(value, Address(to));
2332 break;
2333 case T_SHORT:
2334 __ tbz(count, 0, L_fill_4);
2335 __ strh(value, Address(__ post(to, 2)));
2336 __ bind(L_fill_4);
2337 __ tbz(count, 1, L_exit2);
2338 __ strw(value, Address(to));
2339 break;
2340 case T_INT:
2341 __ cbzw(count, L_exit2);
2342 __ strw(value, Address(to));
2343 break;
2344 default: ShouldNotReachHere();
2345 }
2346 __ bind(L_exit2);
2347 __ leave();
2348 __ ret(lr);
2349 return start;
2350 }
2351
2352   void generate_arraycopy_stubs() {
2353 address entry;
2354 address entry_jbyte_arraycopy;
2355 address entry_jshort_arraycopy;
2356 address entry_jint_arraycopy;
2357 address entry_oop_arraycopy;
2358 address entry_jlong_arraycopy;
2359 address entry_checkcast_arraycopy;
2360
2361 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2362 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2363
2364 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2365
2366 //*** jbyte
2367 // Always need aligned and unaligned versions
2368 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2369 "jbyte_disjoint_arraycopy");
2370 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
2371 &entry_jbyte_arraycopy,
2372 "jbyte_arraycopy");
2373 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2374 "arrayof_jbyte_disjoint_arraycopy");
2375 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
2376 "arrayof_jbyte_arraycopy");
2377
2378 //*** jshort
2379 // Always need aligned and unaligned versions
2380 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2381 "jshort_disjoint_arraycopy");
2382 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
2383 &entry_jshort_arraycopy,
2384 "jshort_arraycopy");
2385 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2386 "arrayof_jshort_disjoint_arraycopy");
2387 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
2388 "arrayof_jshort_arraycopy");
2389
2390 //*** jint
2391 // Aligned versions
2392 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2393 "arrayof_jint_disjoint_arraycopy");
2394 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2395 "arrayof_jint_arraycopy");
2396 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2397 // entry_jint_arraycopy always points to the unaligned version
2398 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
2399 "jint_disjoint_arraycopy");
2400 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
2401 &entry_jint_arraycopy,
2402 "jint_arraycopy");
2403
2404 //*** jlong
2405 // It is always aligned
2406 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2407 "arrayof_jlong_disjoint_arraycopy");
2408 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2409 "arrayof_jlong_arraycopy");
2410 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2411 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2412
2413 //*** oops
2414 {
2415 // With compressed oops we need unaligned versions; notice that
2416 // we overwrite entry_oop_arraycopy.
2417 bool aligned = !UseCompressedOops;
2418
2419 StubRoutines::_arrayof_oop_disjoint_arraycopy
2420 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2421 /*dest_uninitialized*/false);
2422 StubRoutines::_arrayof_oop_arraycopy
2423 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2424 /*dest_uninitialized*/false);
2425 // Aligned versions without pre-barriers
2426 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2427 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2428 /*dest_uninitialized*/true);
2429 StubRoutines::_arrayof_oop_arraycopy_uninit
2430 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2431 /*dest_uninitialized*/true);
2432 }
2433
2434 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2435 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2436 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2437 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2438
2439 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2440 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2441 /*dest_uninitialized*/true);
2442
2443 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
2444 entry_jbyte_arraycopy,
2445 entry_jshort_arraycopy,
2446 entry_jint_arraycopy,
2447 entry_jlong_arraycopy);
2448
2449 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
2450 entry_jbyte_arraycopy,
2451 entry_jshort_arraycopy,
2452 entry_jint_arraycopy,
2453 entry_oop_arraycopy,
2454 entry_jlong_arraycopy,
2455 entry_checkcast_arraycopy);
2456
2457 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2458 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2459 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2460 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2461 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2462 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2463 }
2464
2465   void generate_math_stubs() { Unimplemented(); }
2466
2467 // Arguments:
2468 //
2469 // Inputs:
2470 // c_rarg0 - source byte array address
2471 // c_rarg1 - destination byte array address
2472 // c_rarg2 - K (key) in little endian int array
2473 //
2474   address generate_aescrypt_encryptBlock() {
2475 __ align(CodeEntryAlignment);
2476 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2477
2478 Label L_doLast;
2479
2480 const Register from = c_rarg0; // source array address
2481 const Register to = c_rarg1; // destination array address
2482 const Register key = c_rarg2; // key array address
2483 const Register keylen = rscratch1;
2484
2485 address start = __ pc();
2486 __ enter();
2487
2488 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
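    // keylen is the key array length in ints: 44, 52 or 60 for AES-128, -192
    // or -256 respectively (i.e. 4*(rounds+1) round-key words); the cmpw
    // checks against 44 and 52 below skip the extra rounds for shorter keys.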
2489
2490 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2491
2492 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2493 __ rev32(v1, __ T16B, v1);
2494 __ rev32(v2, __ T16B, v2);
2495 __ rev32(v3, __ T16B, v3);
2496 __ rev32(v4, __ T16B, v4);
2497 __ aese(v0, v1);
2498 __ aesmc(v0, v0);
2499 __ aese(v0, v2);
2500 __ aesmc(v0, v0);
2501 __ aese(v0, v3);
2502 __ aesmc(v0, v0);
2503 __ aese(v0, v4);
2504 __ aesmc(v0, v0);
2505
2506 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2507 __ rev32(v1, __ T16B, v1);
2508 __ rev32(v2, __ T16B, v2);
2509 __ rev32(v3, __ T16B, v3);
2510 __ rev32(v4, __ T16B, v4);
2511 __ aese(v0, v1);
2512 __ aesmc(v0, v0);
2513 __ aese(v0, v2);
2514 __ aesmc(v0, v0);
2515 __ aese(v0, v3);
2516 __ aesmc(v0, v0);
2517 __ aese(v0, v4);
2518 __ aesmc(v0, v0);
2519
2520 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2521 __ rev32(v1, __ T16B, v1);
2522 __ rev32(v2, __ T16B, v2);
2523
2524 __ cmpw(keylen, 44);
2525 __ br(Assembler::EQ, L_doLast);
2526
2527 __ aese(v0, v1);
2528 __ aesmc(v0, v0);
2529 __ aese(v0, v2);
2530 __ aesmc(v0, v0);
2531
2532 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2533 __ rev32(v1, __ T16B, v1);
2534 __ rev32(v2, __ T16B, v2);
2535
2536 __ cmpw(keylen, 52);
2537 __ br(Assembler::EQ, L_doLast);
2538
2539 __ aese(v0, v1);
2540 __ aesmc(v0, v0);
2541 __ aese(v0, v2);
2542 __ aesmc(v0, v0);
2543
2544 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2545 __ rev32(v1, __ T16B, v1);
2546 __ rev32(v2, __ T16B, v2);
2547
2548 __ BIND(L_doLast);
2549
2550 __ aese(v0, v1);
2551 __ aesmc(v0, v0);
2552 __ aese(v0, v2);
2553
2554 __ ld1(v1, __ T16B, key);
2555 __ rev32(v1, __ T16B, v1);
2556 __ eor(v0, __ T16B, v0, v1);
2557
2558 __ st1(v0, __ T16B, to);
2559
2560 __ mov(r0, 0);
2561
2562 __ leave();
2563 __ ret(lr);
2564
2565 return start;
2566 }
2567
2568 // Arguments:
2569 //
2570 // Inputs:
2571 // c_rarg0 - source byte array address
2572 // c_rarg1 - destination byte array address
2573 // c_rarg2 - K (key) in little endian int array
2574 //
2575   address generate_aescrypt_decryptBlock() {
2576 assert(UseAES, "need AES instructions and misaligned SSE support");
2577 __ align(CodeEntryAlignment);
2578 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2579 Label L_doLast;
2580
2581 const Register from = c_rarg0; // source array address
2582 const Register to = c_rarg1; // destination array address
2583 const Register key = c_rarg2; // key array address
2584 const Register keylen = rscratch1;
2585
2586 address start = __ pc();
2587 __ enter(); // required for proper stackwalking of RuntimeStub frame
2588
2589 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2590
2591 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2592
2593 __ ld1(v5, __ T16B, __ post(key, 16));
2594 __ rev32(v5, __ T16B, v5);
2595
2596 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2597 __ rev32(v1, __ T16B, v1);
2598 __ rev32(v2, __ T16B, v2);
2599 __ rev32(v3, __ T16B, v3);
2600 __ rev32(v4, __ T16B, v4);
2601 __ aesd(v0, v1);
2602 __ aesimc(v0, v0);
2603 __ aesd(v0, v2);
2604 __ aesimc(v0, v0);
2605 __ aesd(v0, v3);
2606 __ aesimc(v0, v0);
2607 __ aesd(v0, v4);
2608 __ aesimc(v0, v0);
2609
2610 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2611 __ rev32(v1, __ T16B, v1);
2612 __ rev32(v2, __ T16B, v2);
2613 __ rev32(v3, __ T16B, v3);
2614 __ rev32(v4, __ T16B, v4);
2615 __ aesd(v0, v1);
2616 __ aesimc(v0, v0);
2617 __ aesd(v0, v2);
2618 __ aesimc(v0, v0);
2619 __ aesd(v0, v3);
2620 __ aesimc(v0, v0);
2621 __ aesd(v0, v4);
2622 __ aesimc(v0, v0);
2623
2624 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2625 __ rev32(v1, __ T16B, v1);
2626 __ rev32(v2, __ T16B, v2);
2627
2628 __ cmpw(keylen, 44);
2629 __ br(Assembler::EQ, L_doLast);
2630
2631 __ aesd(v0, v1);
2632 __ aesimc(v0, v0);
2633 __ aesd(v0, v2);
2634 __ aesimc(v0, v0);
2635
2636 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2637 __ rev32(v1, __ T16B, v1);
2638 __ rev32(v2, __ T16B, v2);
2639
2640 __ cmpw(keylen, 52);
2641 __ br(Assembler::EQ, L_doLast);
2642
2643 __ aesd(v0, v1);
2644 __ aesimc(v0, v0);
2645 __ aesd(v0, v2);
2646 __ aesimc(v0, v0);
2647
2648 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2649 __ rev32(v1, __ T16B, v1);
2650 __ rev32(v2, __ T16B, v2);
2651
2652 __ BIND(L_doLast);
2653
2654 __ aesd(v0, v1);
2655 __ aesimc(v0, v0);
2656 __ aesd(v0, v2);
2657
2658 __ eor(v0, __ T16B, v0, v5);
2659
2660 __ st1(v0, __ T16B, to);
2661
2662 __ mov(r0, 0);
2663
2664 __ leave();
2665 __ ret(lr);
2666
2667 return start;
2668 }
2669
2670 // Arguments:
2671 //
2672 // Inputs:
2673 // c_rarg0 - source byte array address
2674 // c_rarg1 - destination byte array address
2675 // c_rarg2 - K (key) in little endian int array
2676 // c_rarg3 - r vector byte array address
2677 // c_rarg4 - input length
2678 //
2679 // Output:
2680 // x0 - input length
2681 //
2682   address generate_cipherBlockChaining_encryptAESCrypt() {
2683 assert(UseAES, "need AES instructions and misaligned SSE support");
2684 __ align(CodeEntryAlignment);
2685 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2686
2687 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2688
2689 const Register from = c_rarg0; // source array address
2690 const Register to = c_rarg1; // destination array address
2691 const Register key = c_rarg2; // key array address
2692 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2693 // and left with the results of the last encryption block
2694 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2695 const Register keylen = rscratch1;
2696
2697 address start = __ pc();
2698
2699 __ enter();
2700
2701 __ movw(rscratch2, len_reg);
2702
2703 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2704
2705 __ ld1(v0, __ T16B, rvec);
2706
2707 __ cmpw(keylen, 52);
2708 __ br(Assembler::CC, L_loadkeys_44);
2709 __ br(Assembler::EQ, L_loadkeys_52);
2710
2711 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2712 __ rev32(v17, __ T16B, v17);
2713 __ rev32(v18, __ T16B, v18);
2714 __ BIND(L_loadkeys_52);
2715 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2716 __ rev32(v19, __ T16B, v19);
2717 __ rev32(v20, __ T16B, v20);
2718 __ BIND(L_loadkeys_44);
2719 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2720 __ rev32(v21, __ T16B, v21);
2721 __ rev32(v22, __ T16B, v22);
2722 __ rev32(v23, __ T16B, v23);
2723 __ rev32(v24, __ T16B, v24);
2724 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2725 __ rev32(v25, __ T16B, v25);
2726 __ rev32(v26, __ T16B, v26);
2727 __ rev32(v27, __ T16B, v27);
2728 __ rev32(v28, __ T16B, v28);
2729 __ ld1(v29, v30, v31, __ T16B, key);
2730 __ rev32(v29, __ T16B, v29);
2731 __ rev32(v30, __ T16B, v30);
2732 __ rev32(v31, __ T16B, v31);
2733
2734 __ BIND(L_aes_loop);
2735 __ ld1(v1, __ T16B, __ post(from, 16));
2736 __ eor(v0, __ T16B, v0, v1);
2737
2738 __ br(Assembler::CC, L_rounds_44);
2739 __ br(Assembler::EQ, L_rounds_52);
2740
2741 __ aese(v0, v17); __ aesmc(v0, v0);
2742 __ aese(v0, v18); __ aesmc(v0, v0);
2743 __ BIND(L_rounds_52);
2744 __ aese(v0, v19); __ aesmc(v0, v0);
2745 __ aese(v0, v20); __ aesmc(v0, v0);
2746 __ BIND(L_rounds_44);
2747 __ aese(v0, v21); __ aesmc(v0, v0);
2748 __ aese(v0, v22); __ aesmc(v0, v0);
2749 __ aese(v0, v23); __ aesmc(v0, v0);
2750 __ aese(v0, v24); __ aesmc(v0, v0);
2751 __ aese(v0, v25); __ aesmc(v0, v0);
2752 __ aese(v0, v26); __ aesmc(v0, v0);
2753 __ aese(v0, v27); __ aesmc(v0, v0);
2754 __ aese(v0, v28); __ aesmc(v0, v0);
2755 __ aese(v0, v29); __ aesmc(v0, v0);
2756 __ aese(v0, v30);
2757 __ eor(v0, __ T16B, v0, v31);
2758
2759 __ st1(v0, __ T16B, __ post(to, 16));
2760
2761 __ subw(len_reg, len_reg, 16);
2762 __ cbnzw(len_reg, L_aes_loop);
2763
2764 __ st1(v0, __ T16B, rvec);
2765
2766 __ mov(r0, rscratch2);
2767
2768 __ leave();
2769 __ ret(lr);
2770
2771 return start;
2772 }
2773
2774 // Arguments:
2775 //
2776 // Inputs:
2777 // c_rarg0 - source byte array address
2778 // c_rarg1 - destination byte array address
2779 // c_rarg2 - K (key) in little endian int array
2780 // c_rarg3 - r vector byte array address
2781 // c_rarg4 - input length
2782 //
2783 // Output:
2784 // r0 - input length
2785 //
2786   address generate_cipherBlockChaining_decryptAESCrypt() {
2787 assert(UseAES, "need AES instructions and misaligned SSE support");
2788 __ align(CodeEntryAlignment);
2789 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2790
2791 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2792
2793 const Register from = c_rarg0; // source array address
2794 const Register to = c_rarg1; // destination array address
2795 const Register key = c_rarg2; // key array address
2796 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2797 // and left with the results of the last encryption block
2798 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2799 const Register keylen = rscratch1;
2800
2801 address start = __ pc();
2802
2803 __ enter();
2804
2805 __ movw(rscratch2, len_reg);
2806
2807 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2808
2809 __ ld1(v2, __ T16B, rvec);
2810
2811 __ ld1(v31, __ T16B, __ post(key, 16));
2812 __ rev32(v31, __ T16B, v31);
2813
2814 __ cmpw(keylen, 52);
2815 __ br(Assembler::CC, L_loadkeys_44);
2816 __ br(Assembler::EQ, L_loadkeys_52);
2817
2818 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2819 __ rev32(v17, __ T16B, v17);
2820 __ rev32(v18, __ T16B, v18);
2821 __ BIND(L_loadkeys_52);
2822 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2823 __ rev32(v19, __ T16B, v19);
2824 __ rev32(v20, __ T16B, v20);
2825 __ BIND(L_loadkeys_44);
2826 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2827 __ rev32(v21, __ T16B, v21);
2828 __ rev32(v22, __ T16B, v22);
2829 __ rev32(v23, __ T16B, v23);
2830 __ rev32(v24, __ T16B, v24);
2831 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2832 __ rev32(v25, __ T16B, v25);
2833 __ rev32(v26, __ T16B, v26);
2834 __ rev32(v27, __ T16B, v27);
2835 __ rev32(v28, __ T16B, v28);
2836 __ ld1(v29, v30, __ T16B, key);
2837 __ rev32(v29, __ T16B, v29);
2838 __ rev32(v30, __ T16B, v30);
2839
2840 __ BIND(L_aes_loop);
2841 __ ld1(v0, __ T16B, __ post(from, 16));
2842 __ orr(v1, __ T16B, v0, v0);
2843
2844 __ br(Assembler::CC, L_rounds_44);
2845 __ br(Assembler::EQ, L_rounds_52);
2846
2847 __ aesd(v0, v17); __ aesimc(v0, v0);
2848 __ aesd(v0, v18); __ aesimc(v0, v0);
2849 __ BIND(L_rounds_52);
2850 __ aesd(v0, v19); __ aesimc(v0, v0);
2851 __ aesd(v0, v20); __ aesimc(v0, v0);
2852 __ BIND(L_rounds_44);
2853 __ aesd(v0, v21); __ aesimc(v0, v0);
2854 __ aesd(v0, v22); __ aesimc(v0, v0);
2855 __ aesd(v0, v23); __ aesimc(v0, v0);
2856 __ aesd(v0, v24); __ aesimc(v0, v0);
2857 __ aesd(v0, v25); __ aesimc(v0, v0);
2858 __ aesd(v0, v26); __ aesimc(v0, v0);
2859 __ aesd(v0, v27); __ aesimc(v0, v0);
2860 __ aesd(v0, v28); __ aesimc(v0, v0);
2861 __ aesd(v0, v29); __ aesimc(v0, v0);
2862 __ aesd(v0, v30);
2863 __ eor(v0, __ T16B, v0, v31);
2864 __ eor(v0, __ T16B, v0, v2);
2865
2866 __ st1(v0, __ T16B, __ post(to, 16));
2867 __ orr(v2, __ T16B, v1, v1);
2868
2869 __ subw(len_reg, len_reg, 16);
2870 __ cbnzw(len_reg, L_aes_loop);
2871
2872 __ st1(v2, __ T16B, rvec);
2873
2874 __ mov(r0, rscratch2);
2875
2876 __ leave();
2877 __ ret(lr);
2878
2879 return start;
2880 }
2881
2882 // Arguments:
2883 //
2884 // Inputs:
2885 // c_rarg0 - byte[] source+offset
2886 // c_rarg1 - int[] SHA.state
2887 // c_rarg2 - int offset
2888 // c_rarg3 - int limit
2889 //
2890   address generate_sha1_implCompress(bool multi_block, const char *name) {
2891 __ align(CodeEntryAlignment);
2892 StubCodeMark mark(this, "StubRoutines", name);
2893 address start = __ pc();
2894
2895 Register buf = c_rarg0;
2896 Register state = c_rarg1;
2897 Register ofs = c_rarg2;
2898 Register limit = c_rarg3;
2899
2900 Label keys;
2901 Label sha1_loop;
2902
2903 // load the keys into v0..v3
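    // (The four 32-bit words emitted at 'keys' below are the standard SHA-1
    //  round constants, one per group of 20 rounds; ld4r broadcasts each of
    //  them across all lanes of its vector register.)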
2904 __ adr(rscratch1, keys);
2905 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2906 // load 5 words state into v6, v7
2907 __ ldrq(v6, Address(state, 0));
2908 __ ldrs(v7, Address(state, 16));
2909
2910
2911 __ BIND(sha1_loop);
2912 // load 64 bytes of data into v16..v19
2913 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2914 __ rev32(v16, __ T16B, v16);
2915 __ rev32(v17, __ T16B, v17);
2916 __ rev32(v18, __ T16B, v18);
2917 __ rev32(v19, __ T16B, v19);
2918
2919 // do the sha1
2920 __ addv(v4, __ T4S, v16, v0);
2921 __ orr(v20, __ T16B, v6, v6);
2922
2923 FloatRegister d0 = v16;
2924 FloatRegister d1 = v17;
2925 FloatRegister d2 = v18;
2926 FloatRegister d3 = v19;
2927
2928 for (int round = 0; round < 20; round++) {
2929 FloatRegister tmp1 = (round & 1) ? v4 : v5;
2930 FloatRegister tmp2 = (round & 1) ? v21 : v22;
2931 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2932 FloatRegister tmp4 = (round & 1) ? v5 : v4;
2933 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2934
2935 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2936 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2937 __ sha1h(tmp2, __ T4S, v20);
2938 if (round < 5)
2939 __ sha1c(v20, __ T4S, tmp3, tmp4);
2940 else if (round < 10 || round >= 15)
2941 __ sha1p(v20, __ T4S, tmp3, tmp4);
2942 else
2943 __ sha1m(v20, __ T4S, tmp3, tmp4);
2944 if (round < 16) __ sha1su1(d0, __ T4S, d3);
2945
2946 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2947 }
2948
2949 __ addv(v7, __ T2S, v7, v21);
2950 __ addv(v6, __ T4S, v6, v20);
2951
2952 if (multi_block) {
2953 __ add(ofs, ofs, 64);
2954 __ cmp(ofs, limit);
2955 __ br(Assembler::LE, sha1_loop);
2956 __ mov(c_rarg0, ofs); // return ofs
2957 }
2958
2959 __ strq(v6, Address(state, 0));
2960 __ strs(v7, Address(state, 16));
2961
2962 __ ret(lr);
2963
2964 __ bind(keys);
2965 __ emit_int32(0x5a827999);
2966 __ emit_int32(0x6ed9eba1);
2967 __ emit_int32(0x8f1bbcdc);
2968 __ emit_int32(0xca62c1d6);
2969
2970 return start;
2971 }
2972
2973
2974 // Arguments:
2975 //
2976 // Inputs:
2977 // c_rarg0 - byte[] source+offset
2978 // c_rarg1 - int[] SHA.state
2979 // c_rarg2 - int offset
2980 // c_rarg3 - int limit
2981 //
2982   address generate_sha256_implCompress(bool multi_block, const char *name) {
2983 static const uint32_t round_consts[64] = {
2984 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2985 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2986 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2987 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2988 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2989 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2990 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2991 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2992 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2993 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2994 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2995 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2996 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2997 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2998 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2999 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3000 };
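    // (round_consts above are the 64 standard SHA-256 round constants
    //  K[0..63], i.e. the first 32 bits of the fractional parts of the cube
    //  roots of the first 64 primes.)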
3001 __ align(CodeEntryAlignment);
3002 StubCodeMark mark(this, "StubRoutines", name);
3003 address start = __ pc();
3004
3005 Register buf = c_rarg0;
3006 Register state = c_rarg1;
3007 Register ofs = c_rarg2;
3008 Register limit = c_rarg3;
3009
3010 Label sha1_loop;
3011
3012 __ stpd(v8, v9, __ pre(sp, -32));
3013 __ stpd(v10, v11, Address(sp, 16));
3014
3015 // dga == v0
3016 // dgb == v1
3017 // dg0 == v2
3018 // dg1 == v3
3019 // dg2 == v4
3020 // t0 == v6
3021 // t1 == v7
3022
3023 // load 16 keys to v16..v31
3024 __ lea(rscratch1, ExternalAddress((address)round_consts));
3025 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3026 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3027 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3028 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3029
3030 // load 8 words (256 bits) state
3031 __ ldpq(v0, v1, state);
3032
3033 __ BIND(sha1_loop);
3034 // load 64 bytes of data into v8..v11
3035 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3036 __ rev32(v8, __ T16B, v8);
3037 __ rev32(v9, __ T16B, v9);
3038 __ rev32(v10, __ T16B, v10);
3039 __ rev32(v11, __ T16B, v11);
3040
3041 __ addv(v6, __ T4S, v8, v16);
3042 __ orr(v2, __ T16B, v0, v0);
3043 __ orr(v3, __ T16B, v1, v1);
3044
3045 FloatRegister d0 = v8;
3046 FloatRegister d1 = v9;
3047 FloatRegister d2 = v10;
3048 FloatRegister d3 = v11;
3049
3050
3051 for (int round = 0; round < 16; round++) {
3052 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3053 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3054 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3055 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3056
3057 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3058 __ orr(v4, __ T16B, v2, v2);
3059 if (round < 15)
3060 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3061 __ sha256h(v2, __ T4S, v3, tmp2);
3062 __ sha256h2(v3, __ T4S, v4, tmp2);
3063 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3064
3065 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3066 }
3067
3068 __ addv(v0, __ T4S, v0, v2);
3069 __ addv(v1, __ T4S, v1, v3);
3070
3071 if (multi_block) {
3072 __ add(ofs, ofs, 64);
3073 __ cmp(ofs, limit);
3074 __ br(Assembler::LE, sha1_loop);
3075 __ mov(c_rarg0, ofs); // return ofs
3076 }
3077
3078 __ ldpd(v10, v11, Address(sp, 16));
3079 __ ldpd(v8, v9, __ post(sp, 32));
3080
3081 __ stpq(v0, v1, state);
3082
3083 __ ret(lr);
3084
3085 return start;
3086 }
3087
3088 // Safefetch stubs.
3089   void generate_safefetch(const char* name, int size, address* entry,
3090 address* fault_pc, address* continuation_pc) {
3091 // safefetch signatures:
3092 // int SafeFetch32(int* adr, int errValue);
3093 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3094 //
3095 // arguments:
3096 // c_rarg0 = adr
3097 // c_rarg1 = errValue
3098 //
3099 // result:
3100     //   r0 = *adr or errValue
3101
3102 StubCodeMark mark(this, "StubRoutines", name);
3103
3104 // Entry point, pc or function descriptor.
3105 *entry = __ pc();
3106
3107 // Load *adr into c_rarg1, may fault.
3108 *fault_pc = __ pc();
3109 switch (size) {
3110 case 4:
3111 // int32_t
3112 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3113 break;
3114 case 8:
3115 // int64_t
3116 __ ldr(c_rarg1, Address(c_rarg0, 0));
3117 break;
3118 default:
3119 ShouldNotReachHere();
3120 }
3121
3122 // return errValue or *adr
3123 *continuation_pc = __ pc();
3124 __ mov(r0, c_rarg1);
3125 __ ret(lr);
3126 }
3127
3128 /**
3129 * Arguments:
3130 *
3131 * Inputs:
3132 * c_rarg0 - int crc
3133 * c_rarg1 - byte* buf
3134 * c_rarg2 - int length
3135 *
3136    * Output:
3137    *       r0   - int crc result
3138 */
3139   address generate_updateBytesCRC32() {
3140 assert(UseCRC32Intrinsics, "what are we doing here?");
3141
3142 __ align(CodeEntryAlignment);
3143 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3144
3145 address start = __ pc();
3146
3147 const Register crc = c_rarg0; // crc
3148 const Register buf = c_rarg1; // source java byte array address
3149 const Register len = c_rarg2; // length
3150 const Register table0 = c_rarg3; // crc_table address
3151 const Register table1 = c_rarg4;
3152 const Register table2 = c_rarg5;
3153 const Register table3 = c_rarg6;
3154 const Register tmp3 = c_rarg7;
3155
3156 BLOCK_COMMENT("Entry:");
3157 __ enter(); // required for proper stackwalking of RuntimeStub frame
3158
3159 __ kernel_crc32(crc, buf, len,
3160 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3161
3162 __ leave(); // required for proper stackwalking of RuntimeStub frame
3163 __ ret(lr);
3164
3165 return start;
3166 }
3167
3168 /**
3169 * Arguments:
3170 *
3171 * Inputs:
3172 * c_rarg0 - int crc
3173 * c_rarg1 - byte* buf
3174 * c_rarg2 - int length
3175 * c_rarg3 - int* table
3176 *
3177    * Output:
3178 * r0 - int crc result
3179 */
3180   address generate_updateBytesCRC32C() {
3181 assert(UseCRC32CIntrinsics, "what are we doing here?");
3182
3183 __ align(CodeEntryAlignment);
3184 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3185
3186 address start = __ pc();
3187
3188 const Register crc = c_rarg0; // crc
3189 const Register buf = c_rarg1; // source java byte array address
3190 const Register len = c_rarg2; // length
3191 const Register table0 = c_rarg3; // crc_table address
3192 const Register table1 = c_rarg4;
3193 const Register table2 = c_rarg5;
3194 const Register table3 = c_rarg6;
3195 const Register tmp3 = c_rarg7;
3196
3197 BLOCK_COMMENT("Entry:");
3198 __ enter(); // required for proper stackwalking of RuntimeStub frame
3199
3200 __ kernel_crc32c(crc, buf, len,
3201 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3202
3203 __ leave(); // required for proper stackwalking of RuntimeStub frame
3204 __ ret(lr);
3205
3206 return start;
3207 }
3208
3209 /***
3210 * Arguments:
3211 *
3212 * Inputs:
3213 * c_rarg0 - int adler
3214 * c_rarg1 - byte* buff
3215 * c_rarg2 - int len
3216 *
3217 * Output:
3218 * c_rarg0 - int adler result
3219 */
3220   address generate_updateBytesAdler32() {
3221 __ align(CodeEntryAlignment);
3222 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3223 address start = __ pc();
3224
3225 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3226
3227 // Aliases
3228 Register adler = c_rarg0;
3229 Register s1 = c_rarg0;
3230 Register s2 = c_rarg3;
3231 Register buff = c_rarg1;
3232 Register len = c_rarg2;
3233 Register nmax = r4;
3234 Register base = r5;
3235 Register count = r6;
3236 Register temp0 = rscratch1;
3237 Register temp1 = rscratch2;
3238 FloatRegister vbytes = v0;
3239 FloatRegister vs1acc = v1;
3240 FloatRegister vs2acc = v2;
3241 FloatRegister vtable = v3;
3242
3243 // Max number of bytes we can process before having to take the mod
3244 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
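    // (Worked check for n = 5552: 255*5552*5553/2 = 3,930,857,640 and
    //  5553*(BASE-1) = 5553*65520 = 363,832,560; their sum 4,294,690,200 still
    //  fits in 2^32-1, whereas n = 5553 would overflow.)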
3245 uint64_t BASE = 0xfff1;
3246 uint64_t NMAX = 0x15B0;
3247
3248 __ mov(base, BASE);
3249 __ mov(nmax, NMAX);
3250
3251 // Load accumulation coefficients for the upper 16 bits
3252 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3253 __ ld1(vtable, __ T16B, Address(temp0));
3254
3255 // s1 is initialized to the lower 16 bits of adler
3256 // s2 is initialized to the upper 16 bits of adler
3257 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3258 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3259
3260     // The pipelined loop needs at least 16 elements for 1 iteration.
3261     // It does check this, but it is more efficient to skip to the cleanup loop.
3262 __ cmp(len, 16);
3263 __ br(Assembler::HS, L_nmax);
3264 __ cbz(len, L_combine);
3265
3266 __ bind(L_simple_by1_loop);
3267 __ ldrb(temp0, Address(__ post(buff, 1)));
3268 __ add(s1, s1, temp0);
3269 __ add(s2, s2, s1);
3270 __ subs(len, len, 1);
3271 __ br(Assembler::HI, L_simple_by1_loop);
3272
3273 // s1 = s1 % BASE
3274 __ subs(temp0, s1, base);
3275 __ csel(s1, temp0, s1, Assembler::HS);
3276
3277 // s2 = s2 % BASE
3278 __ lsr(temp0, s2, 16);
3279 __ lsl(temp1, temp0, 4);
3280 __ sub(temp1, temp1, temp0);
3281 __ add(s2, temp1, s2, ext::uxth);
3282
3283 __ subs(temp0, s2, base);
3284 __ csel(s2, temp0, s2, Assembler::HS);
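    // Sketch of the division-free reduction used above and in the NMAX loop
    // below (illustrative C, not generated code): since 65536 mod 65521 == 15,
    //   s mod 65521 == ((s >> 16) * 15 + (s & 0xffff)) mod 65521,
    // so each lsr/lsl/sub/add group folds the high half down; once the value is
    // below 2 * BASE a single conditional subtract (subs + csel) finishes the mod.
    //   uint32_t fold(uint32_t s)     { return (s >> 16) * 15 + (s & 0xffff); }
    //   uint32_t mod_base(uint32_t s) {       // valid once fold(s) < 2 * 0xfff1
    //     s = fold(s);
    //     return s >= 0xfff1 ? s - 0xfff1 : s;
    //   }
    // The NMAX loop below applies fold() twice because its accumulators can be
    // close to 2^32 before reduction.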
3285
3286 __ b(L_combine);
3287
3288 __ bind(L_nmax);
3289 __ subs(len, len, nmax);
3290 __ sub(count, nmax, 16);
3291 __ br(Assembler::LO, L_by16);
3292
3293 __ bind(L_nmax_loop);
3294
3295 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3296 vbytes, vs1acc, vs2acc, vtable);
3297
3298 __ subs(count, count, 16);
3299 __ br(Assembler::HS, L_nmax_loop);
3300
3301 // s1 = s1 % BASE
3302 __ lsr(temp0, s1, 16);
3303 __ lsl(temp1, temp0, 4);
3304 __ sub(temp1, temp1, temp0);
3305 __ add(temp1, temp1, s1, ext::uxth);
3306
3307 __ lsr(temp0, temp1, 16);
3308 __ lsl(s1, temp0, 4);
3309 __ sub(s1, s1, temp0);
3310     __ add(s1, s1, temp1, ext::uxth);
3311
3312 __ subs(temp0, s1, base);
3313 __ csel(s1, temp0, s1, Assembler::HS);
3314
3315 // s2 = s2 % BASE
3316 __ lsr(temp0, s2, 16);
3317 __ lsl(temp1, temp0, 4);
3318 __ sub(temp1, temp1, temp0);
3319 __ add(temp1, temp1, s2, ext::uxth);
3320
3321 __ lsr(temp0, temp1, 16);
3322 __ lsl(s2, temp0, 4);
3323 __ sub(s2, s2, temp0);
3324     __ add(s2, s2, temp1, ext::uxth);
3325
3326 __ subs(temp0, s2, base);
3327 __ csel(s2, temp0, s2, Assembler::HS);
3328
3329 __ subs(len, len, nmax);
3330 __ sub(count, nmax, 16);
3331 __ br(Assembler::HS, L_nmax_loop);
3332
3333 __ bind(L_by16);
3334 __ adds(len, len, count);
3335 __ br(Assembler::LO, L_by1);
3336
3337 __ bind(L_by16_loop);
3338
3339 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3340 vbytes, vs1acc, vs2acc, vtable);
3341
3342 __ subs(len, len, 16);
3343 __ br(Assembler::HS, L_by16_loop);
3344
3345 __ bind(L_by1);
3346 __ adds(len, len, 15);
3347 __ br(Assembler::LO, L_do_mod);
3348
3349 __ bind(L_by1_loop);
3350 __ ldrb(temp0, Address(__ post(buff, 1)));
3351 __ add(s1, temp0, s1);
3352 __ add(s2, s2, s1);
3353 __ subs(len, len, 1);
3354 __ br(Assembler::HS, L_by1_loop);
3355
3356 __ bind(L_do_mod);
3357 // s1 = s1 % BASE
3358 __ lsr(temp0, s1, 16);
3359 __ lsl(temp1, temp0, 4);
3360 __ sub(temp1, temp1, temp0);
3361 __ add(temp1, temp1, s1, ext::uxth);
3362
3363 __ lsr(temp0, temp1, 16);
3364 __ lsl(s1, temp0, 4);
3365 __ sub(s1, s1, temp0);
3366     __ add(s1, s1, temp1, ext::uxth);
3367
3368 __ subs(temp0, s1, base);
3369 __ csel(s1, temp0, s1, Assembler::HS);
3370
3371 // s2 = s2 % BASE
3372 __ lsr(temp0, s2, 16);
3373 __ lsl(temp1, temp0, 4);
3374 __ sub(temp1, temp1, temp0);
3375 __ add(temp1, temp1, s2, ext::uxth);
3376
3377 __ lsr(temp0, temp1, 16);
3378 __ lsl(s2, temp0, 4);
3379 __ sub(s2, s2, temp0);
3380     __ add(s2, s2, temp1, ext::uxth);
3381
3382 __ subs(temp0, s2, base);
3383 __ csel(s2, temp0, s2, Assembler::HS);
3384
3385 // Combine lower bits and higher bits
3386 __ bind(L_combine);
3387 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3388
3389 __ ret(lr);
3390
3391 return start;
3392 }
3393
3394   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3395 Register temp0, Register temp1, FloatRegister vbytes,
3396 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3397 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3398 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3399 // In non-vectorized code, we update s1 and s2 as:
3400 // s1 <- s1 + b1
3401 // s2 <- s2 + s1
3402 // s1 <- s1 + b2
3403     //   s2 <- s2 + s1
3404 // ...
3405 // s1 <- s1 + b16
3406 // s2 <- s2 + s1
3407 // Putting above assignments together, we have:
3408 // s1_new = s1 + b1 + b2 + ... + b16
3409 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3410 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3411 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3412 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3413
3414 // s2 = s2 + s1 * 16
3415 __ add(s2, s2, s1, Assembler::LSL, 4);
3416
3417 // vs1acc = b1 + b2 + b3 + ... + b16
3418 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3419 __ umullv(vs2acc, __ T8B, vtable, vbytes);
3420 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3421 __ uaddlv(vs1acc, __ T16B, vbytes);
3422 __ uaddlv(vs2acc, __ T8H, vs2acc);
3423
3424 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3425 __ fmovd(temp0, vs1acc);
3426 __ fmovd(temp1, vs2acc);
3427 __ add(s1, s1, temp0);
3428 __ add(s2, s2, temp1);
3429 }
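  // A minimal scalar sketch (illustrative only, not generated code) of the
  // identity the vector code above implements for one 16-byte chunk:
  //   void accum16_ref(uint32_t& s1, uint32_t& s2, const uint8_t b[16]) {
  //     s2 += 16 * s1;
  //     uint32_t sum = 0, dot = 0;
  //     for (int i = 0; i < 16; i++) {
  //       sum += b[i];              // b1 + b2 + ... + b16
  //       dot += (16 - i) * b[i];   // b1*16 + b2*15 + ... + b16*1
  //     }
  //     s1 += sum;
  //     s2 += dot;
  //   }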
3430
3431 /**
3432 * Arguments:
3433 *
3434 * Input:
3435 * c_rarg0 - x address
3436 * c_rarg1 - x length
3437 * c_rarg2 - y address
3438    *   c_rarg3   - y length
3439 * c_rarg4 - z address
3440 * c_rarg5 - z length
3441 */
3442   address generate_multiplyToLen() {
3443 __ align(CodeEntryAlignment);
3444 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3445
3446 address start = __ pc();
3447 const Register x = r0;
3448 const Register xlen = r1;
3449 const Register y = r2;
3450 const Register ylen = r3;
3451 const Register z = r4;
3452 const Register zlen = r5;
3453
3454 const Register tmp1 = r10;
3455 const Register tmp2 = r11;
3456 const Register tmp3 = r12;
3457 const Register tmp4 = r13;
3458 const Register tmp5 = r14;
3459 const Register tmp6 = r15;
3460 const Register tmp7 = r16;
3461
3462 BLOCK_COMMENT("Entry:");
3463 __ enter(); // required for proper stackwalking of RuntimeStub frame
3464 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3465 __ leave(); // required for proper stackwalking of RuntimeStub frame
3466 __ ret(lr);
3467
3468 return start;
3469 }
3470
3471   address generate_squareToLen() {
3472     // The squareToLen algorithm for sizes 1..127, as described in the Java code,
3473     // runs faster than multiply_to_len on some CPUs and slower on others, but
3474     // multiply_to_len shows slightly better results overall.
3475 __ align(CodeEntryAlignment);
3476 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3477 address start = __ pc();
3478
3479 const Register x = r0;
3480 const Register xlen = r1;
3481 const Register z = r2;
3482 const Register zlen = r3;
3483 const Register y = r4; // == x
3484 const Register ylen = r5; // == xlen
3485
3486 const Register tmp1 = r10;
3487 const Register tmp2 = r11;
3488 const Register tmp3 = r12;
3489 const Register tmp4 = r13;
3490 const Register tmp5 = r14;
3491 const Register tmp6 = r15;
3492 const Register tmp7 = r16;
3493
3494 RegSet spilled_regs = RegSet::of(y, ylen);
3495 BLOCK_COMMENT("Entry:");
3496 __ enter();
3497 __ push(spilled_regs, sp);
3498 __ mov(y, x);
3499 __ mov(ylen, xlen);
3500 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3501 __ pop(spilled_regs, sp);
3502 __ leave();
3503 __ ret(lr);
3504 return start;
3505 }
3506
3507   address generate_mulAdd() {
3508 __ align(CodeEntryAlignment);
3509 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3510
3511 address start = __ pc();
3512
3513 const Register out = r0;
3514 const Register in = r1;
3515 const Register offset = r2;
3516 const Register len = r3;
3517 const Register k = r4;
3518
3519 BLOCK_COMMENT("Entry:");
3520 __ enter();
3521 __ mul_add(out, in, offset, len, k);
3522 __ leave();
3523 __ ret(lr);
3524
3525 return start;
3526 }
3527
3528   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3529 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3530 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3531 // Karatsuba multiplication performs a 128*128 -> 256-bit
3532 // multiplication in three 128-bit multiplications and a few
3533 // additions.
3534 //
3535 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3536 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3537 //
3538 // Inputs:
3539 //
3540 // A0 in a.d[0] (subkey)
3541 // A1 in a.d[1]
3542 // (A1+A0) in a1_xor_a0.d[0]
3543 //
3544 // B0 in b.d[0] (state)
3545 // B1 in b.d[1]
3546
3547 __ ext(tmp1, __ T16B, b, b, 0x08);
3548 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3549 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3550 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3551 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3552
3553 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3554 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3555 __ eor(tmp2, __ T16B, tmp2, tmp4);
3556 __ eor(tmp2, __ T16B, tmp2, tmp3);
3557
3558 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3559 __ ins(result_hi, __ D, tmp2, 0, 1);
3560 __ ins(result_lo, __ D, tmp2, 1, 0);
3561 }
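  // The same Karatsuba identity on ordinary integers, as an illustrative
  // sketch: a 32x32->64 multiply built from three 16x16 multiplies. In the
  // GF(2)[x] version above, "+" and "-" are both XOR and pmull is the multiply.
  //   uint64_t karatsuba32_ref(uint32_t a, uint32_t b) {
  //     uint64_t a1 = a >> 16, a0 = a & 0xffff;
  //     uint64_t b1 = b >> 16, b0 = b & 0xffff;
  //     uint64_t c = a1 * b1;                        // C = A1*B1
  //     uint64_t d = a0 * b0;                        // D = A0*B0
  //     uint64_t e = (a1 + a0) * (b1 + b0);          // E = (A1+A0)*(B1+B0)
  //     return (c << 32) + ((e - c - d) << 16) + d;  // == (uint64_t)a * b
  //   }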
3562
3563   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3564 FloatRegister p, FloatRegister z, FloatRegister t1) {
3565 const FloatRegister t0 = result;
3566
3567 // The GCM field polynomial f is z^128 + p(z), where p =
3568 // z^7+z^2+z+1.
3569 //
3570 // z^128 === -p(z) (mod (z^128 + p(z)))
3571 //
3572 // so, given that the product we're reducing is
3573 // a == lo + hi * z^128
3574 // substituting,
3575 // === lo - hi * p(z) (mod (z^128 + p(z)))
3576 //
3577 // we reduce by multiplying hi by p(z) and subtracting the result
3578 // from (i.e. XORing it with) lo. Because p has no nonzero high
3579 // bits we can do this with two 64-bit multiplications, lo*p and
3580 // hi*p.
3581
3582 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3583 __ ext(t1, __ T16B, t0, z, 8);
3584 __ eor(hi, __ T16B, hi, t1);
3585 __ ext(t1, __ T16B, z, t0, 8);
3586 __ eor(lo, __ T16B, lo, t1);
3587 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3588 __ eor(result, __ T16B, lo, t0);
3589 }
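  // The same reduction idea in a smaller field, as an illustrative sketch (the
  // GF(2^8) field of AES with conventional bit order, not the GHASH field):
  // fold the high half down by carry-less-multiplying it with p(x), twice.
  //   uint16_t clmul8(uint8_t a, uint8_t b) {        // carry-less multiply
  //     uint16_t r = 0;
  //     for (int i = 0; i < 8; i++)
  //       if (b & (1 << i)) r ^= (uint16_t)a << i;
  //     return r;
  //   }
  //   uint8_t reduce8(uint16_t v) {                  // v mod (x^8 + p(x)), p(x) = 0x1b
  //     uint16_t t = clmul8(v >> 8, 0x1b);           // high half times p(x)
  //     return (uint8_t)v ^ (uint8_t)t ^ (uint8_t)clmul8(t >> 8, 0x1b);
  //   }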
3590
3591   address generate_has_negatives(address &has_negatives_long) {
3592 const int large_loop_size = 64;
3593 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3594 int dcache_line = VM_Version::dcache_line_size();
3595
3596 Register ary1 = r1, len = r2, result = r0;
3597
3598 __ align(CodeEntryAlignment);
3599
3600 StubCodeMark mark(this, "StubRoutines", "has_negatives");
3601
3602 address entry = __ pc();
3603
3604 __ enter();
3605
3606 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3607 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3608
3609 __ cmp(len, 15);
3610 __ br(Assembler::GT, LEN_OVER_15);
3611     // The only case in which execution falls into this code is when the pointer
3612     // is near the end of a memory page and we have to avoid reading the next page.
3613 __ add(ary1, ary1, len);
3614 __ subs(len, len, 8);
3615 __ br(Assembler::GT, LEN_OVER_8);
3616 __ ldr(rscratch2, Address(ary1, -8));
3617 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
3618 __ lsrv(rscratch2, rscratch2, rscratch1);
3619 __ tst(rscratch2, UPPER_BIT_MASK);
3620 __ cset(result, Assembler::NE);
3621 __ leave();
3622 __ ret(lr);
3623 __ bind(LEN_OVER_8);
3624 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3625 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3626 __ tst(rscratch2, UPPER_BIT_MASK);
3627 __ br(Assembler::NE, RET_TRUE_NO_POP);
3628 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3629 __ lsrv(rscratch1, rscratch1, rscratch2);
3630 __ tst(rscratch1, UPPER_BIT_MASK);
3631 __ cset(result, Assembler::NE);
3632 __ leave();
3633 __ ret(lr);
3634
3635 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3636 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3637
3638 has_negatives_long = __ pc(); // 2nd entry point
3639
3640 __ enter();
3641
3642 __ bind(LEN_OVER_15);
3643 __ push(spilled_regs, sp);
3644 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3645 __ cbz(rscratch2, ALIGNED);
3646 __ ldp(tmp6, tmp1, Address(ary1));
3647 __ mov(tmp5, 16);
3648 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3649 __ add(ary1, ary1, rscratch1);
3650 __ sub(len, len, rscratch1);
3651 __ orr(tmp6, tmp6, tmp1);
3652 __ tst(tmp6, UPPER_BIT_MASK);
3653 __ br(Assembler::NE, RET_TRUE);
3654
3655 __ bind(ALIGNED);
3656 __ cmp(len, large_loop_size);
3657 __ br(Assembler::LT, CHECK_16);
3658     // Perform a 16-byte load in the pre-loop as an early return, to handle the
3659     // case where an initially aligned large array has negative values in its
3660     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3661     // worst case, which is slower. Cases with negative bytes further ahead are
3662     // barely affected; in fact they become faster thanks to the early loads and
3663     // the fewer instructions and branches in LARGE_LOOP.
3664 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3665 __ sub(len, len, 16);
3666 __ orr(tmp6, tmp6, tmp1);
3667 __ tst(tmp6, UPPER_BIT_MASK);
3668 __ br(Assembler::NE, RET_TRUE);
3669 __ cmp(len, large_loop_size);
3670 __ br(Assembler::LT, CHECK_16);
3671
3672 if (SoftwarePrefetchHintDistance >= 0
3673 && SoftwarePrefetchHintDistance >= dcache_line) {
3674 // initial prefetch
3675 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3676 }
3677 __ bind(LARGE_LOOP);
3678 if (SoftwarePrefetchHintDistance >= 0) {
3679 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3680 }
3681     // Issue the load instructions first, since that can save a few CPU/memory
3682     // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one
3683     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3684     // which saves 3 instructions per iteration and has fewer branches, at the cost
3685     // of disabling early return: all 64 bytes are loaded and checked every time.
3686 __ ldp(tmp2, tmp3, Address(ary1));
3687 __ ldp(tmp4, tmp5, Address(ary1, 16));
3688 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3689 __ ldp(tmp6, tmp1, Address(ary1, 48));
3690 __ add(ary1, ary1, large_loop_size);
3691 __ sub(len, len, large_loop_size);
3692 __ orr(tmp2, tmp2, tmp3);
3693 __ orr(tmp4, tmp4, tmp5);
3694 __ orr(rscratch1, rscratch1, rscratch2);
3695 __ orr(tmp6, tmp6, tmp1);
3696 __ orr(tmp2, tmp2, tmp4);
3697 __ orr(rscratch1, rscratch1, tmp6);
3698 __ orr(tmp2, tmp2, rscratch1);
3699 __ tst(tmp2, UPPER_BIT_MASK);
3700 __ br(Assembler::NE, RET_TRUE);
3701 __ cmp(len, large_loop_size);
3702 __ br(Assembler::GE, LARGE_LOOP);
3703
3704 __ bind(CHECK_16); // small 16-byte load pre-loop
3705 __ cmp(len, 16);
3706 __ br(Assembler::LT, POST_LOOP16);
3707
3708 __ bind(LOOP16); // small 16-byte load loop
3709 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3710 __ sub(len, len, 16);
3711 __ orr(tmp2, tmp2, tmp3);
3712 __ tst(tmp2, UPPER_BIT_MASK);
3713 __ br(Assembler::NE, RET_TRUE);
3714 __ cmp(len, 16);
3715 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3716
3717 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3718 __ cmp(len, 8);
3719 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3720 __ ldr(tmp3, Address(__ post(ary1, 8)));
3721 __ sub(len, len, 8);
3722 __ tst(tmp3, UPPER_BIT_MASK);
3723 __ br(Assembler::NE, RET_TRUE);
3724
3725 __ bind(POST_LOOP16_LOAD_TAIL);
3726 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3727 __ ldr(tmp1, Address(ary1));
3728 __ mov(tmp2, 64);
3729 __ sub(tmp4, tmp2, len, __ LSL, 3);
3730 __ lslv(tmp1, tmp1, tmp4);
3731 __ tst(tmp1, UPPER_BIT_MASK);
3732 __ br(Assembler::NE, RET_TRUE);
3733 // Fallthrough
3734
3735 __ bind(RET_FALSE);
3736 __ pop(spilled_regs, sp);
3737 __ leave();
3738 __ mov(result, zr);
3739 __ ret(lr);
3740
3741 __ bind(RET_TRUE);
3742 __ pop(spilled_regs, sp);
3743 __ bind(RET_TRUE_NO_POP);
3744 __ leave();
3745 __ mov(result, 1);
3746 __ ret(lr);
3747
3748 __ bind(DONE);
3749 __ pop(spilled_regs, sp);
3750 __ leave();
3751 __ ret(lr);
3752 return entry;
3753 }
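  // Scalar sketch of what the stub computes (illustrative only, assuming
  // <stdint.h>/<string.h>): report whether any byte has its sign bit set.
  //   int has_negatives_ref(const int8_t* a, size_t len) {
  //     const uint64_t mask = 0x8080808080808080ULL;   // UPPER_BIT_MASK
  //     size_t i = 0;
  //     for (; i + 8 <= len; i += 8) {
  //       uint64_t v;
  //       memcpy(&v, a + i, 8);                        // 8 bytes per iteration
  //       if (v & mask) return 1;
  //     }
  //     for (; i < len; i++)
  //       if (a[i] < 0) return 1;
  //     return 0;
  //   }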
3754
3755   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3756 bool usePrefetch, Label &NOT_EQUAL) {
3757 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3758 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3759 tmp7 = r12, tmp8 = r13;
3760 Label LOOP;
3761
3762 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3763 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3764 __ bind(LOOP);
3765 if (usePrefetch) {
3766 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3767 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3768 }
3769 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3770 __ eor(tmp1, tmp1, tmp2);
3771 __ eor(tmp3, tmp3, tmp4);
3772 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3773 __ orr(tmp1, tmp1, tmp3);
3774 __ cbnz(tmp1, NOT_EQUAL);
3775 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3776 __ eor(tmp5, tmp5, tmp6);
3777 __ eor(tmp7, tmp7, tmp8);
3778 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3779 __ orr(tmp5, tmp5, tmp7);
3780 __ cbnz(tmp5, NOT_EQUAL);
3781 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3782 __ eor(tmp1, tmp1, tmp2);
3783 __ eor(tmp3, tmp3, tmp4);
3784 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3785 __ orr(tmp1, tmp1, tmp3);
3786 __ cbnz(tmp1, NOT_EQUAL);
3787 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3788 __ eor(tmp5, tmp5, tmp6);
3789 __ sub(cnt1, cnt1, 8 * wordSize);
3790 __ eor(tmp7, tmp7, tmp8);
3791 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3792     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3793     // cmp) because subs allows an unlimited range of immediate operands.
3794 __ subs(tmp6, cnt1, loopThreshold);
3795 __ orr(tmp5, tmp5, tmp7);
3796 __ cbnz(tmp5, NOT_EQUAL);
3797 __ br(__ GE, LOOP);
3798 // post-loop
3799 __ eor(tmp1, tmp1, tmp2);
3800 __ eor(tmp3, tmp3, tmp4);
3801 __ orr(tmp1, tmp1, tmp3);
3802 __ sub(cnt1, cnt1, 2 * wordSize);
3803 __ cbnz(tmp1, NOT_EQUAL);
3804 }
3805
3806   void generate_large_array_equals_loop_simd(int loopThreshold,
3807 bool usePrefetch, Label &NOT_EQUAL) {
3808 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3809 tmp2 = rscratch2;
3810 Label LOOP;
3811
3812 __ bind(LOOP);
3813 if (usePrefetch) {
3814 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3815 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3816 }
3817 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3818 __ sub(cnt1, cnt1, 8 * wordSize);
3819 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3820 __ subs(tmp1, cnt1, loopThreshold);
3821 __ eor(v0, __ T16B, v0, v4);
3822 __ eor(v1, __ T16B, v1, v5);
3823 __ eor(v2, __ T16B, v2, v6);
3824 __ eor(v3, __ T16B, v3, v7);
3825 __ orr(v0, __ T16B, v0, v1);
3826 __ orr(v1, __ T16B, v2, v3);
3827 __ orr(v0, __ T16B, v0, v1);
3828 __ umov(tmp1, v0, __ D, 0);
3829 __ umov(tmp2, v0, __ D, 1);
3830 __ orr(tmp1, tmp1, tmp2);
3831 __ cbnz(tmp1, NOT_EQUAL);
3832 __ br(__ GE, LOOP);
3833 }
3834
3835 // a1 = r1 - array1 address
3836 // a2 = r2 - array2 address
3837 // result = r0 - return value. Already contains "false"
3838 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3839 // r3-r5 are reserved temporary registers
3840   address generate_large_array_equals() {
3841 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3842 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3843 tmp7 = r12, tmp8 = r13;
3844 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3845 SMALL_LOOP, POST_LOOP;
3846 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3847 // calculate if at least 32 prefetched bytes are used
3848 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3849 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3850 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3851 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3852 tmp5, tmp6, tmp7, tmp8);
3853
3854 __ align(CodeEntryAlignment);
3855
3856 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3857
3858 address entry = __ pc();
3859 __ enter();
3860 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3861 // also advance pointers to use post-increment instead of pre-increment
3862 __ add(a1, a1, wordSize);
3863 __ add(a2, a2, wordSize);
3864 if (AvoidUnalignedAccesses) {
3865       // Both implementations (SIMD and non-SIMD) use relatively large load
3866       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3867       // time) on some CPUs when the address is not at least 16-byte aligned.
3868       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
3869       // load when needed, at least for the 1st address, to make it 16-byte aligned.
3870 Label ALIGNED16;
3871 __ tbz(a1, 3, ALIGNED16);
3872 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3873 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3874 __ sub(cnt1, cnt1, wordSize);
3875 __ eor(tmp1, tmp1, tmp2);
3876 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3877 __ bind(ALIGNED16);
3878 }
3879 if (UseSIMDForArrayEquals) {
3880 if (SoftwarePrefetchHintDistance >= 0) {
3881 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3882 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3883 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3884 /* prfm = */ true, NOT_EQUAL);
3885 __ cmp(cnt1, nonPrefetchLoopThreshold);
3886 __ br(__ LT, TAIL);
3887 }
3888 __ bind(NO_PREFETCH_LARGE_LOOP);
3889 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3890 /* prfm = */ false, NOT_EQUAL);
3891 } else {
3892 __ push(spilled_regs, sp);
3893 if (SoftwarePrefetchHintDistance >= 0) {
3894 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3895 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3896 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3897 /* prfm = */ true, NOT_EQUAL);
3898 __ cmp(cnt1, nonPrefetchLoopThreshold);
3899 __ br(__ LT, TAIL);
3900 }
3901 __ bind(NO_PREFETCH_LARGE_LOOP);
3902 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3903 /* prfm = */ false, NOT_EQUAL);
3904 }
3905 __ bind(TAIL);
3906 __ cbz(cnt1, EQUAL);
3907 __ subs(cnt1, cnt1, wordSize);
3908 __ br(__ LE, POST_LOOP);
3909 __ bind(SMALL_LOOP);
3910 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3911 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3912 __ subs(cnt1, cnt1, wordSize);
3913 __ eor(tmp1, tmp1, tmp2);
3914 __ cbnz(tmp1, NOT_EQUAL);
3915 __ br(__ GT, SMALL_LOOP);
3916 __ bind(POST_LOOP);
3917 __ ldr(tmp1, Address(a1, cnt1));
3918 __ ldr(tmp2, Address(a2, cnt1));
3919 __ eor(tmp1, tmp1, tmp2);
3920 __ cbnz(tmp1, NOT_EQUAL);
3921 __ bind(EQUAL);
3922 __ mov(result, true);
3923 __ bind(NOT_EQUAL);
3924 if (!UseSIMDForArrayEquals) {
3925 __ pop(spilled_regs, sp);
3926 }
3927 __ bind(NOT_EQUAL_NO_POP);
3928 __ leave();
3929 __ ret(lr);
3930 return entry;
3931 }
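  // Scalar sketch of the comparison idea used above (illustrative only): XOR
  // corresponding words and OR the differences together; the data are equal
  // iff the accumulated value is still zero, so only one branch is needed per
  // group of words.
  //   bool words_equal_ref(const uint64_t* a, const uint64_t* b, size_t nwords) {
  //     uint64_t diff = 0;
  //     for (size_t i = 0; i < nwords; i++)
  //       diff |= a[i] ^ b[i];
  //     return diff == 0;
  //   }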
3932
3933   address generate_dsin_dcos(bool isCos) {
3934 __ align(CodeEntryAlignment);
3935 StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3936 address start = __ pc();
3937 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3938 (address)StubRoutines::aarch64::_two_over_pi,
3939 (address)StubRoutines::aarch64::_pio2,
3940 (address)StubRoutines::aarch64::_dsin_coef,
3941 (address)StubRoutines::aarch64::_dcos_coef);
3942 return start;
3943 }
3944
3945   address generate_dlog() {
3946 __ align(CodeEntryAlignment);
3947 StubCodeMark mark(this, "StubRoutines", "dlog");
3948 address entry = __ pc();
3949 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3950 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3951 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3952 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3953 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3954 return entry;
3955 }
3956
3957 // code for comparing 16 bytes of strings with same encoding
3958   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3959 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3960 __ ldr(rscratch1, Address(__ post(str1, 8)));
3961 __ eor(rscratch2, tmp1, tmp2);
3962 __ ldr(cnt1, Address(__ post(str2, 8)));
3963 __ cbnz(rscratch2, DIFF1);
3964 __ ldr(tmp1, Address(__ post(str1, 8)));
3965 __ eor(rscratch2, rscratch1, cnt1);
3966 __ ldr(tmp2, Address(__ post(str2, 8)));
3967 __ cbnz(rscratch2, DIFF2);
3968 }
3969
3970 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3971   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3972 Label &DIFF2) {
3973 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3974 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3975
3976 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3977 __ ldr(tmpU, Address(__ post(cnt1, 8)));
3978 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
3979 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
3980
3981 __ fmovd(tmpL, vtmp3);
3982 __ eor(rscratch2, tmp3, tmpL);
3983 __ cbnz(rscratch2, DIFF2);
3984
3985 __ ldr(tmp3, Address(__ post(cnt1, 8)));
3986 __ umov(tmpL, vtmp3, __ D, 1);
3987 __ eor(rscratch2, tmpU, tmpL);
3988 __ cbnz(rscratch2, DIFF1);
3989
3990 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
3991 __ ldr(tmpU, Address(__ post(cnt1, 8)));
3992 __ fmovd(tmpL, vtmp);
3993 __ eor(rscratch2, tmp3, tmpL);
3994 __ cbnz(rscratch2, DIFF2);
3995
3996 __ ldr(tmp3, Address(__ post(cnt1, 8)));
3997 __ umov(tmpL, vtmp, __ D, 1);
3998 __ eor(rscratch2, tmpU, tmpL);
3999 __ cbnz(rscratch2, DIFF1);
4000 }
4001
4002 // r0 = result
4003 // r1 = str1
4004 // r2 = cnt1
4005 // r3 = str2
4006 // r4 = cnt2
4007 // r10 = tmp1
4008 // r11 = tmp2
4009   address generate_compare_long_string_different_encoding(bool isLU) {
4010 __ align(CodeEntryAlignment);
4011 StubCodeMark mark(this, "StubRoutines", isLU
4012 ? "compare_long_string_different_encoding LU"
4013 : "compare_long_string_different_encoding UL");
4014 address entry = __ pc();
4015 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4016 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4017 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4018 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4019 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4020 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4021 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4022
4023 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4024
4025 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4026     // cnt2 == number of characters left to compare
4027     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
4028 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4029 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4030 __ add(str2, str2, isLU ? wordSize : wordSize/2);
4031 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4032 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4033 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4034 __ eor(rscratch2, tmp1, tmp2);
4035 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4036 __ mov(rscratch1, tmp2);
4037 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4038 Register strU = isLU ? str2 : str1,
4039 strL = isLU ? str1 : str2,
4040 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4041 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4042 __ push(spilled_regs, sp);
4043 __ sub(tmp2, strL, cnt2); // strL pointer to load from
4044 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4045
4046 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4047
4048 if (SoftwarePrefetchHintDistance >= 0) {
4049 __ cmp(cnt2, prefetchLoopExitCondition);
4050 __ br(__ LT, NO_PREFETCH);
4051 __ bind(LARGE_LOOP_PREFETCH);
4052 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4053 __ mov(tmp4, 2);
4054 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4055 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4056 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4057 __ subs(tmp4, tmp4, 1);
4058 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4059 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4060 __ mov(tmp4, 2);
4061 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4062 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4063 __ subs(tmp4, tmp4, 1);
4064 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4065 __ sub(cnt2, cnt2, 64);
4066 __ cmp(cnt2, prefetchLoopExitCondition);
4067 __ br(__ GE, LARGE_LOOP_PREFETCH);
4068 }
4069 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4070 __ bind(NO_PREFETCH);
4071 __ subs(cnt2, cnt2, 16);
4072 __ br(__ LT, TAIL);
4073 __ bind(SMALL_LOOP); // smaller loop
4074 __ subs(cnt2, cnt2, 16);
4075 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4076 __ br(__ GE, SMALL_LOOP);
4077 __ cmn(cnt2, (u1)16);
4078 __ br(__ EQ, LOAD_LAST);
4079 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4080 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4081 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4082 __ ldr(tmp3, Address(cnt1, -8));
4083 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4084 __ b(LOAD_LAST);
4085 __ bind(DIFF2);
4086 __ mov(tmpU, tmp3);
4087 __ bind(DIFF1);
4088 __ pop(spilled_regs, sp);
4089 __ b(CALCULATE_DIFFERENCE);
4090 __ bind(LOAD_LAST);
4091 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4092 // No need to load it again
4093 __ mov(tmpU, tmp3);
4094 __ pop(spilled_regs, sp);
4095
4096 __ ldrs(vtmp, Address(strL));
4097 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4098 __ fmovd(tmpL, vtmp);
4099
4100 __ eor(rscratch2, tmpU, tmpL);
4101 __ cbz(rscratch2, DONE);
4102
4103 // Find the first different characters in the longwords and
4104 // compute their difference.
4105 __ bind(CALCULATE_DIFFERENCE);
4106 __ rev(rscratch2, rscratch2);
4107 __ clz(rscratch2, rscratch2);
4108 __ andr(rscratch2, rscratch2, -16);
4109 __ lsrv(tmp1, tmp1, rscratch2);
4110 __ uxthw(tmp1, tmp1);
4111 __ lsrv(rscratch1, rscratch1, rscratch2);
4112 __ uxthw(rscratch1, rscratch1);
4113 __ subw(result, tmp1, rscratch1);
4114 __ bind(DONE);
4115 __ ret(lr);
4116 return entry;
4117 }
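  // Scalar analogue of the CALCULATE_DIFFERENCE sequence above (illustrative
  // only; assumes little-endian loads, a nonzero xor and GCC/Clang builtins):
  //   int first_char_diff_ref(uint64_t a, uint64_t b) {
  //     uint64_t x = a ^ b;                  // nonzero by the time we get here
  //     int bit = __builtin_ctzll(x) & ~15;  // round down to a 16-bit boundary
  //     return (int)((a >> bit) & 0xffff) - (int)((b >> bit) & 0xffff);
  //   }
  // The stub derives the same shift amount with rev + clz + andr(-16) instead
  // of a count-trailing-zeros instruction.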
4118
4119 // r0 = result
4120 // r1 = str1
4121 // r2 = cnt1
4122 // r3 = str2
4123 // r4 = cnt2
4124 // r10 = tmp1
4125 // r11 = tmp2
4126   address generate_compare_long_string_same_encoding(bool isLL) {
4127 __ align(CodeEntryAlignment);
4128 StubCodeMark mark(this, "StubRoutines", isLL
4129 ? "compare_long_string_same_encoding LL"
4130 : "compare_long_string_same_encoding UU");
4131 address entry = __ pc();
4132 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4133 tmp1 = r10, tmp2 = r11;
4134 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4135 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4136 DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4137     // exit from the large loop when fewer than 64 bytes are left to read or we're
4138     // about to prefetch memory past the end of the array
4139 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4140 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4141 // update cnt2 counter with already loaded 8 bytes
4142 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4143 // update pointers, because of previous read
4144 __ add(str1, str1, wordSize);
4145 __ add(str2, str2, wordSize);
4146 if (SoftwarePrefetchHintDistance >= 0) {
4147 __ bind(LARGE_LOOP_PREFETCH);
4148 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4149 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4150 compare_string_16_bytes_same(DIFF, DIFF2);
4151 compare_string_16_bytes_same(DIFF, DIFF2);
4152 __ sub(cnt2, cnt2, isLL ? 64 : 32);
4153 compare_string_16_bytes_same(DIFF, DIFF2);
4154 __ cmp(cnt2, largeLoopExitCondition);
4155 compare_string_16_bytes_same(DIFF, DIFF2);
4156 __ br(__ GT, LARGE_LOOP_PREFETCH);
4157 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4158 }
4159 // less than 16 bytes left?
4160 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4161 __ br(__ LT, TAIL);
4162 __ bind(SMALL_LOOP);
4163 compare_string_16_bytes_same(DIFF, DIFF2);
4164 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4165 __ br(__ GE, SMALL_LOOP);
4166 __ bind(TAIL);
4167 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4168 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4169 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4170 __ br(__ LE, CHECK_LAST);
4171 __ eor(rscratch2, tmp1, tmp2);
4172 __ cbnz(rscratch2, DIFF);
4173 __ ldr(tmp1, Address(__ post(str1, 8)));
4174 __ ldr(tmp2, Address(__ post(str2, 8)));
4175 __ sub(cnt2, cnt2, isLL ? 8 : 4);
4176 __ bind(CHECK_LAST);
4177 if (!isLL) {
4178 __ add(cnt2, cnt2, cnt2); // now in bytes
4179 }
4180 __ eor(rscratch2, tmp1, tmp2);
4181 __ cbnz(rscratch2, DIFF);
4182 __ ldr(rscratch1, Address(str1, cnt2));
4183 __ ldr(cnt1, Address(str2, cnt2));
4184 __ eor(rscratch2, rscratch1, cnt1);
4185 __ cbz(rscratch2, LENGTH_DIFF);
4186 // Find the first different characters in the longwords and
4187 // compute their difference.
4188 __ bind(DIFF2);
4189 __ rev(rscratch2, rscratch2);
4190 __ clz(rscratch2, rscratch2);
4191 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4192 __ lsrv(rscratch1, rscratch1, rscratch2);
4193 if (isLL) {
4194 __ lsrv(cnt1, cnt1, rscratch2);
4195 __ uxtbw(rscratch1, rscratch1);
4196 __ uxtbw(cnt1, cnt1);
4197 } else {
4198 __ lsrv(cnt1, cnt1, rscratch2);
4199 __ uxthw(rscratch1, rscratch1);
4200 __ uxthw(cnt1, cnt1);
4201 }
4202 __ subw(result, rscratch1, cnt1);
4203 __ b(LENGTH_DIFF);
4204 __ bind(DIFF);
4205 __ rev(rscratch2, rscratch2);
4206 __ clz(rscratch2, rscratch2);
4207 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4208 __ lsrv(tmp1, tmp1, rscratch2);
4209 if (isLL) {
4210 __ lsrv(tmp2, tmp2, rscratch2);
4211 __ uxtbw(tmp1, tmp1);
4212 __ uxtbw(tmp2, tmp2);
4213 } else {
4214 __ lsrv(tmp2, tmp2, rscratch2);
4215 __ uxthw(tmp1, tmp1);
4216 __ uxthw(tmp2, tmp2);
4217 }
4218 __ subw(result, tmp1, tmp2);
4219 __ b(LENGTH_DIFF);
4220 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4221 __ eor(rscratch2, tmp1, tmp2);
4222 __ cbnz(rscratch2, DIFF);
4223 __ bind(LENGTH_DIFF);
4224 __ ret(lr);
4225 return entry;
4226 }
4227
4228   void generate_compare_long_strings() {
4229 StubRoutines::aarch64::_compare_long_string_LL
4230 = generate_compare_long_string_same_encoding(true);
4231 StubRoutines::aarch64::_compare_long_string_UU
4232 = generate_compare_long_string_same_encoding(false);
4233 StubRoutines::aarch64::_compare_long_string_LU
4234 = generate_compare_long_string_different_encoding(true);
4235 StubRoutines::aarch64::_compare_long_string_UL
4236 = generate_compare_long_string_different_encoding(false);
4237 }
4238
4239 // R0 = result
4240 // R1 = str2
4241 // R2 = cnt1
4242 // R3 = str1
4243 // R4 = cnt2
4244   // This generic linear code uses a few additional ideas that make it faster:
4245   // 1) we can safely keep at least the 1st register of the pattern (since its
4246   //    length >= 8) to skip the initial load (helps on systems with one ld pipeline)
4247   // 2) we can use a "fast" algorithm for finding the first character to search for,
4248   //    with fewer branches (1 branch per loaded register instead of one per symbol);
4249   //    this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
4250   //    0x7fff7fff...7fff come from (see the sketch below)
4251   // 3) after loading and analyzing the 1st register of the source string, it can be
4252   //    reused to search for every occurrence of the 1st character, saving a few
4253   //    loads compared with a simpler-but-slower implementation
4254   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4255   //    re-initializes and compresses register values, which makes the code larger
4256   //    and a bit less readable; however, most of the extra operations are issued
4257   //    during loads or branches, so the penalty is minimal
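  // Sketch of the "fast" first-character search referred to in (2) above
  // (illustrative only, shown for the Latin-1 case): XOR a word of the source
  // with the first pattern character replicated into every byte, then test for
  // a zero byte without branching on each byte.
  //   bool word_has_zero_byte(uint64_t v) {
  //     return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
  //   }
  //   // usage: replicate the first pattern char, first = 0x0101...01 * ch; a
  //   // candidate match exists in chunk iff word_has_zero_byte(chunk ^ first).
  // The code below computes the same predicate with sub, orr(0x7f7f...7f) and
  // bics, and with the 16-bit constants for the UTF-16 case.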
4258   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4259 const char* stubName = str1_isL
4260 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4261 : "indexof_linear_uu";
4262 __ align(CodeEntryAlignment);
4263 StubCodeMark mark(this, "StubRoutines", stubName);
4264 address entry = __ pc();
4265
4266 int str1_chr_size = str1_isL ? 1 : 2;
4267 int str2_chr_size = str2_isL ? 1 : 2;
4268 int str1_chr_shift = str1_isL ? 0 : 1;
4269 int str2_chr_shift = str2_isL ? 0 : 1;
4270 bool isL = str1_isL && str2_isL;
4271 // parameters
4272 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4273 // temporary registers
4274 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4275 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4276 // redefinitions
4277 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4278
4279 __ push(spilled_regs, sp);
4280 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
4281 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4282 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4283 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4284 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4285 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4286 // Read whole register from str1. It is safe, because length >=8 here
4287 __ ldr(ch1, Address(str1));
4288 // Read whole register from str2. It is safe, because length >=8 here
4289 __ ldr(ch2, Address(str2));
4290 __ sub(cnt2, cnt2, cnt1);
4291 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4292 if (str1_isL != str2_isL) {
4293 __ eor(v0, __ T16B, v0, v0);
4294 }
4295 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4296 __ mul(first, first, tmp1);
4297 // check if we have less than 1 register to check
4298 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4299 if (str1_isL != str2_isL) {
4300 __ fmovd(v1, ch1);
4301 }
4302 __ br(__ LE, L_SMALL);
4303 __ eor(ch2, first, ch2);
4304 if (str1_isL != str2_isL) {
4305 __ zip1(v1, __ T16B, v1, v0);
4306 }
4307 __ sub(tmp2, ch2, tmp1);
4308 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4309 __ bics(tmp2, tmp2, ch2);
4310 if (str1_isL != str2_isL) {
4311 __ fmovd(ch1, v1);
4312 }
4313 __ br(__ NE, L_HAS_ZERO);
4314 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4315 __ add(result, result, wordSize/str2_chr_size);
4316 __ add(str2, str2, wordSize);
4317 __ br(__ LT, L_POST_LOOP);
4318 __ BIND(L_LOOP);
4319 __ ldr(ch2, Address(str2));
4320 __ eor(ch2, first, ch2);
4321 __ sub(tmp2, ch2, tmp1);
4322 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4323 __ bics(tmp2, tmp2, ch2);
4324 __ br(__ NE, L_HAS_ZERO);
4325 __ BIND(L_LOOP_PROCEED);
4326 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4327 __ add(str2, str2, wordSize);
4328 __ add(result, result, wordSize/str2_chr_size);
4329 __ br(__ GE, L_LOOP);
4330 __ BIND(L_POST_LOOP);
4331 __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
4332 __ br(__ LE, NOMATCH);
4333 __ ldr(ch2, Address(str2));
4334 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4335 __ eor(ch2, first, ch2);
4336 __ sub(tmp2, ch2, tmp1);
4337 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4338 __ mov(tmp4, -1); // all bits set
4339 __ b(L_SMALL_PROCEED);
4340 __ align(OptoLoopAlignment);
4341 __ BIND(L_SMALL);
4342 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4343 __ eor(ch2, first, ch2);
4344 if (str1_isL != str2_isL) {
4345 __ zip1(v1, __ T16B, v1, v0);
4346 }
4347 __ sub(tmp2, ch2, tmp1);
4348 __ mov(tmp4, -1); // all bits set
4349 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4350 if (str1_isL != str2_isL) {
4351 __ fmovd(ch1, v1); // move converted 4 symbols
4352 }
4353 __ BIND(L_SMALL_PROCEED);
4354 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4355 __ bic(tmp2, tmp2, ch2);
4356 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4357 __ rbit(tmp2, tmp2);
4358 __ br(__ EQ, NOMATCH);
4359 __ BIND(L_SMALL_HAS_ZERO_LOOP);
4360 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
4361 __ cmp(cnt1, wordSize/str2_chr_size);
4362 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4363 if (str2_isL) { // LL
4364 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4365 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4366 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4367 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4368 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4369 } else {
4370 __ mov(ch2, 0xE); // all bits in byte set except last one
4371 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4372 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4373 __ lslv(tmp2, tmp2, tmp4);
4374 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4375 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4376 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4377 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4378 }
4379 __ cmp(ch1, ch2);
4380 __ mov(tmp4, wordSize/str2_chr_size);
4381 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4382 __ BIND(L_SMALL_CMP_LOOP);
4383 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4384 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4385 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4386 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4387 __ add(tmp4, tmp4, 1);
4388 __ cmp(tmp4, cnt1);
4389 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4390 __ cmp(first, ch2);
4391 __ br(__ EQ, L_SMALL_CMP_LOOP);
4392 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4393 __ cbz(tmp2, NOMATCH); // no more matches. exit
4394 __ clz(tmp4, tmp2);
4395 __ add(result, result, 1); // advance index
4396 __ add(str2, str2, str2_chr_size); // advance pointer
4397 __ b(L_SMALL_HAS_ZERO_LOOP);
4398 __ align(OptoLoopAlignment);
4399 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4400 __ cmp(first, ch2);
4401 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4402 __ b(DONE);
4403 __ align(OptoLoopAlignment);
4404 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4405 if (str2_isL) { // LL
4406 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4407 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4408 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4409 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4410 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4411 } else {
4412 __ mov(ch2, 0xE); // all bits in byte set except last one
4413 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4414 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4415 __ lslv(tmp2, tmp2, tmp4);
4416 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4417 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4418 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4419 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4420 }
4421 __ cmp(ch1, ch2);
4422 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4423 __ b(DONE);
4424 __ align(OptoLoopAlignment);
4425 __ BIND(L_HAS_ZERO);
4426 __ rbit(tmp2, tmp2);
4427 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
4428 // Now, perform compression of counters(cnt2 and cnt1) into one register.
4429 // It's fine because both counters are 32bit and are not changed in this
4430 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
4431 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4432 __ sub(result, result, 1);
4433 __ BIND(L_HAS_ZERO_LOOP);
4434 __ mov(cnt1, wordSize/str2_chr_size);
4435 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4436 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4437 if (str2_isL) {
4438 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4439 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4440 __ lslv(tmp2, tmp2, tmp4);
4441 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4442 __ add(tmp4, tmp4, 1);
4443 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4444 __ lsl(tmp2, tmp2, 1);
4445 __ mov(tmp4, wordSize/str2_chr_size);
4446 } else {
4447 __ mov(ch2, 0xE);
4448 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4449 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4450 __ lslv(tmp2, tmp2, tmp4);
4451 __ add(tmp4, tmp4, 1);
4452 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4453 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4454 __ lsl(tmp2, tmp2, 1);
4455 __ mov(tmp4, wordSize/str2_chr_size);
4456 __ sub(str2, str2, str2_chr_size);
4457 }
4458 __ cmp(ch1, ch2);
4459 __ mov(tmp4, wordSize/str2_chr_size);
4460 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4461 __ BIND(L_CMP_LOOP);
4462 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4463 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4464 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4465 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4466 __ add(tmp4, tmp4, 1);
4467 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4468 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4469 __ cmp(cnt1, ch2);
4470 __ br(__ EQ, L_CMP_LOOP);
4471 __ BIND(L_CMP_LOOP_NOMATCH);
4472 // here we're not matched
4473 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4474 __ clz(tmp4, tmp2);
4475 __ add(str2, str2, str2_chr_size); // advance pointer
4476 __ b(L_HAS_ZERO_LOOP);
4477 __ align(OptoLoopAlignment);
4478 __ BIND(L_CMP_LOOP_LAST_CMP);
4479 __ cmp(cnt1, ch2);
4480 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4481 __ b(DONE);
4482 __ align(OptoLoopAlignment);
4483 __ BIND(L_CMP_LOOP_LAST_CMP2);
4484 if (str2_isL) {
4485 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4486 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4487 __ lslv(tmp2, tmp2, tmp4);
4488 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4489 __ add(tmp4, tmp4, 1);
4490 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4491 __ lsl(tmp2, tmp2, 1);
4492 } else {
4493 __ mov(ch2, 0xE);
4494 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4495 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4496 __ lslv(tmp2, tmp2, tmp4);
4497 __ add(tmp4, tmp4, 1);
4498 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4499 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4500 __ lsl(tmp2, tmp2, 1);
4501 __ sub(str2, str2, str2_chr_size);
4502 }
4503 __ cmp(ch1, ch2);
4504 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4505 __ b(DONE);
4506 __ align(OptoLoopAlignment);
4507 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4508 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
4509 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
4510 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
4511 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
4512 // result by analyzed characters value, so, we can just reset lower bits
4513 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
4514 // 2) restore cnt1 and cnt2 values from "compressed" cnt2
4515     // 3) advance str2 value to represent the next str2 octet. result & 7/3 is
4516     // the index of the last analyzed substring inside the current octet, so str2
4517     // is at the corresponding start address. We need to advance it to the next octet.
4518 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4519 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4520 __ bfm(result, zr, 0, 2 - str2_chr_shift);
4521 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4522 __ movw(cnt2, cnt2);
4523 __ b(L_LOOP_PROCEED);
4524 __ align(OptoLoopAlignment);
4525 __ BIND(NOMATCH);
4526 __ mov(result, -1);
4527 __ BIND(DONE);
4528 __ pop(spilled_regs, sp);
4529 __ ret(lr);
4530 return entry;
4531 }
4532
4533   void generate_string_indexof_stubs() {
4534 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4535 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4536 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4537 }
4538
4539   void inflate_and_store_2_fp_registers(bool generatePrfm,
4540 FloatRegister src1, FloatRegister src2) {
4541 Register dst = r1;
4542 __ zip1(v1, __ T16B, src1, v0);
4543 __ zip2(v2, __ T16B, src1, v0);
4544 if (generatePrfm) {
4545 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4546 }
4547 __ zip1(v3, __ T16B, src2, v0);
4548 __ zip2(v4, __ T16B, src2, v0);
4549 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4550 }
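  // Scalar sketch of the inflation performed here (illustrative only): widen
  // Latin-1 bytes to UTF-16 chars by interleaving each byte with a zero byte,
  // which is what zip1/zip2 against the zeroed v0 register do 16 bytes at a time.
  //   void inflate_ref(const uint8_t* src, uint16_t* dst, size_t n) {
  //     for (size_t i = 0; i < n; i++)
  //       dst[i] = src[i];   // zero-extend 8 -> 16 bits
  //   }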
4551
4552 // R0 = src
4553 // R1 = dst
4554 // R2 = len
4555 // R3 = len >> 3
4556 // V0 = 0
4557 // v1 = loaded 8 bytes
4558   address generate_large_byte_array_inflate() {
4559 __ align(CodeEntryAlignment);
4560 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4561 address entry = __ pc();
4562 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4563 Register src = r0, dst = r1, len = r2, octetCounter = r3;
4564 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
4565
4566     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
4567     // this also lets us use a single store instruction
4568 __ ldrd(v2, __ post(src, 8));
4569 __ sub(octetCounter, octetCounter, 2);
4570 __ zip1(v1, __ T16B, v1, v0);
4571 __ zip1(v2, __ T16B, v2, v0);
4572 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4573 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4574 __ cmp(octetCounter, large_loop_threshold);
4575 __ br(__ LE, LOOP_START);
4576 __ b(LOOP_PRFM_START);
4577 __ bind(LOOP_PRFM);
4578 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4579 __ bind(LOOP_PRFM_START);
4580 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4581 __ sub(octetCounter, octetCounter, 8);
4582 __ cmp(octetCounter, large_loop_threshold);
4583 inflate_and_store_2_fp_registers(true, v3, v4);
4584 inflate_and_store_2_fp_registers(true, v5, v6);
4585 __ br(__ GT, LOOP_PRFM);
4586 __ cmp(octetCounter, 8);
4587 __ br(__ LT, DONE);
4588 __ bind(LOOP);
4589 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4590 __ bind(LOOP_START);
4591 __ sub(octetCounter, octetCounter, 8);
4592 __ cmp(octetCounter, 8);
4593 inflate_and_store_2_fp_registers(false, v3, v4);
4594 inflate_and_store_2_fp_registers(false, v5, v6);
4595 __ br(__ GE, LOOP);
4596 __ bind(DONE);
4597 __ ret(lr);
4598 return entry;
4599 }
4600
4601 /**
4602 * Arguments:
4603 *
4604 * Input:
4605 * c_rarg0 - current state address
4606 * c_rarg1 - H key address
4607 * c_rarg2 - data address
4608 * c_rarg3 - number of blocks
4609 *
4610 * Output:
4611 * Updated state at c_rarg0
4612 */
4613   address generate_ghash_processBlocks() {
4614 // Bafflingly, GCM uses little-endian for the byte order, but
4615 // big-endian for the bit order. For example, the polynomial 1 is
4616 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4617 //
4618 // So, we must either reverse the bytes in each word and do
4619 // everything big-endian or reverse the bits in each byte and do
4620 // it little-endian. On AArch64 it's more idiomatic to reverse
4621 // the bits in each byte (we have an instruction, RBIT, to do
4622     // that) and keep the data in little-endian bit order throughout the
4623 // calculation, bit-reversing the inputs and outputs.
4624
4625 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4626 __ align(wordSize * 2);
4627 address p = __ pc();
4628 __ emit_int64(0x87); // The low-order bits of the field
4629 // polynomial (i.e. p = z^7+z^2+z+1)
4630 // repeated in the low and high parts of a
4631 // 128-bit vector
4632 __ emit_int64(0x87);
4633
4634 __ align(CodeEntryAlignment);
4635 address start = __ pc();
4636
4637 Register state = c_rarg0;
4638 Register subkeyH = c_rarg1;
4639 Register data = c_rarg2;
4640 Register blocks = c_rarg3;
4641
4642 FloatRegister vzr = v30;
4643 __ eor(vzr, __ T16B, vzr, vzr); // zero register
4644
4645 __ ldrq(v0, Address(state));
4646 __ ldrq(v1, Address(subkeyH));
4647
4648 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
4649 __ rbit(v0, __ T16B, v0);
4650 __ rev64(v1, __ T16B, v1);
4651 __ rbit(v1, __ T16B, v1);
4652
4653 __ ldrq(v26, p);
4654
4655 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4656 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4657
4658 {
4659 Label L_ghash_loop;
4660 __ bind(L_ghash_loop);
4661
4662 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4663 // reversing each byte
4664 __ rbit(v2, __ T16B, v2);
4665 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
4666
4667 // Multiply state in v2 by subkey in v1
4668 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4669 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4670 /*temps*/v6, v20, v18, v21);
4671 // Reduce v7:v5 by the field polynomial
4672 ghash_reduce(v0, v5, v7, v26, vzr, v20);
4673
4674 __ sub(blocks, blocks, 1);
4675 __ cbnz(blocks, L_ghash_loop);
4676 }
4677
4678 // The bit-reversed result is at this point in v0
4679 __ rev64(v1, __ T16B, v0);
4680 __ rbit(v1, __ T16B, v1);
4681
4682 __ st1(v1, __ T16B, state);
4683 __ ret(lr);
4684
4685 return start;
4686 }
4687
4688 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
4689
4690 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
4691 //
4692 // If LSE is in use, generate LSE versions of all the stubs. The
4693 // non-LSE versions are in atomic_aarch64.S.
4694
4695 // class AtomicStubMark records the entry point of a stub and the
4696 // stub pointer which will point to it. The stub pointer is set to
4697 // the entry point when ~AtomicStubMark() is called, which must be
4698 // after ICache::invalidate_range. This ensures safe publication of
4699 // the generated code.
4700 class AtomicStubMark {
4701 address _entry_point;
4702 aarch64_atomic_stub_t *_stub;
4703 MacroAssembler *_masm;
4704 public:
4705     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
4706 _masm = masm;
4707 __ align(32);
4708 _entry_point = __ pc();
4709 _stub = stub;
4710 }
4711     ~AtomicStubMark() {
4712 *_stub = (aarch64_atomic_stub_t)_entry_point;
4713 }
4714 };
4715
4716 // NB: For memory_order_conservative we need a trailing membar after
4717 // LSE atomic operations but not a leading membar.
4718 //
4719 // We don't need a leading membar because a clause in the Arm ARM
4720 // says:
4721 //
4722 // Barrier-ordered-before
4723 //
4724 // Barrier instructions order prior Memory effects before subsequent
4725 // Memory effects generated by the same Observer. A read or a write
4726   // RW1 is Barrier-ordered-before a read or a write RW2 from the same
4727   // Observer if and only if RW1 appears in program order before RW2
4728   // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
4729 // instruction with both Acquire and Release semantics.
4730 //
4731 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
4732 // and Release semantics, therefore we don't need a leading
4733 // barrier. However, there is no corresponding Barrier-ordered-after
4734 // relationship, therefore we need a trailing membar to prevent a
4735 // later store or load from being reordered with the store in an
4736 // atomic instruction.
4737 //
4738 // This was checked by using the herd7 consistency model simulator
4739 // (http://diy.inria.fr/) with this test case:
4740 //
4741 // AArch64 LseCas
4742 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
4743 // P0 | P1;
4744 // LDR W4, [X2] | MOV W3, #0;
4745 // DMB LD | MOV W4, #1;
4746 // LDR W3, [X1] | CASAL W3, W4, [X1];
4747 // | DMB ISH;
4748 // | STR W4, [X2];
4749 // exists
4750 // (0:X3=0 /\ 0:X4=1)
4751 //
4752 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
4753 // with the store to x in P1. Without the DMB in P1 this may happen.
4754 //
4755 // At the time of writing we don't know of any AArch64 hardware that
4756 // reorders stores in this way, but the Reference Manual permits it.
4757
4758 void gen_cas_entry(Assembler::operand_size size,
4759 atomic_memory_order order) {
4760 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
4761 exchange_val = c_rarg2;
4762 bool acquire, release;
4763 switch (order) {
4764 case memory_order_relaxed:
4765 acquire = false;
4766 release = false;
4767 break;
4768 default:
4769 acquire = true;
4770 release = true;
4771 break;
4772 }
4773 __ mov(prev, compare_val);
4774 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
4775 if (order == memory_order_conservative) {
4776 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4777 }
4778 if (size == Assembler::xword) {
4779 __ mov(r0, prev);
4780 } else {
4781 __ movw(r0, prev);
4782 }
4783 __ ret(lr);
4784 }
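// (Added sketch, not in the original source.) Ignoring the word/xword
// operand size and the optional trailing barrier, the generated CAS stub
// behaves roughly like:
//
//   uint64_t cas(volatile uint64_t *ptr, uint64_t compare, uint64_t exchange) {
//     uint64_t prev = compare;
//     __atomic_compare_exchange_n(ptr, &prev, exchange, /*weak*/false,
//                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
//     return prev;   // the value found at *ptr, whether or not it matched
//   }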
4785
4786 void gen_ldaddal_entry(Assembler::operand_size size) {
4787 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
4788 __ ldaddal(size, incr, prev, addr);
4789 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4790 if (size == Assembler::xword) {
4791 __ mov(r0, prev);
4792 } else {
4793 __ movw(r0, prev);
4794 }
4795 __ ret(lr);
4796 }
4797
4798 void gen_swpal_entry(Assembler::operand_size size) {
4799 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
4800 __ swpal(size, incr, prev, addr);
4801 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
4802 if (size == Assembler::xword) {
4803 __ mov(r0, prev);
4804 } else {
4805 __ movw(r0, prev);
4806 }
4807 __ ret(lr);
4808 }
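// (Added note, not in the original source.) The two stubs above are an
// atomic fetch-and-add and an atomic exchange, each followed by a full
// barrier; in C terms they return __atomic_fetch_add(addr, incr,
// __ATOMIC_ACQ_REL) and __atomic_exchange_n(addr, incr, __ATOMIC_ACQ_REL)
// respectively, i.e. the previous value at addr.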
4809
4810 void generate_atomic_entry_points() {
4811 if (! UseLSE) {
4812 return;
4813 }
4814
4815 __ align(CodeEntryAlignment);
4816 StubCodeMark mark(this, "StubRoutines", "atomic entry points");
4817 address first_entry = __ pc();
4818
4819 // All memory_order_conservative
4820 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
4821 gen_ldaddal_entry(Assembler::word);
4822 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
4823 gen_ldaddal_entry(Assembler::xword);
4824
4825 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
4826 gen_swpal_entry(Assembler::word);
4827 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
4828 gen_swpal_entry(Assembler::xword);
4829
4830 // CAS, memory_order_conservative
4831 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
4832 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
4833 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
4834 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
4835 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
4836 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
4837
4838 // CAS, memory_order_relaxed
4839 AtomicStubMark mark_cmpxchg_1_relaxed
4840 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
4841 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
4842 AtomicStubMark mark_cmpxchg_4_relaxed
4843 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
4844 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
4845 AtomicStubMark mark_cmpxchg_8_relaxed
4846 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
4847 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
4848
4849 ICache::invalidate_range(first_entry, __ pc() - first_entry);
4850 }
4851 #endif // LINUX || _ALLBSD_SOURCE
4852
4853 void generate_base64_encode_simdround(Register src, Register dst,
4854 FloatRegister codec, u8 size) {
4855
4856 FloatRegister in0 = v4, in1 = v5, in2 = v6;
4857 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
4858 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
4859
4860 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
4861
4862 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
4863
4864 __ ushr(ind0, arrangement, in0, 2);
4865
4866 __ ushr(ind1, arrangement, in1, 2);
4867 __ shl(in0, arrangement, in0, 6);
4868 __ orr(ind1, arrangement, ind1, in0);
4869 __ ushr(ind1, arrangement, ind1, 2);
4870
4871 __ ushr(ind2, arrangement, in2, 4);
4872 __ shl(in1, arrangement, in1, 4);
4873 __ orr(ind2, arrangement, in1, ind2);
4874 __ ushr(ind2, arrangement, ind2, 2);
4875
4876 __ shl(ind3, arrangement, in2, 2);
4877 __ ushr(ind3, arrangement, ind3, 2);
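// (Added note, not in the original source.) For each 3-byte input group
// {b0, b1, b2} loaded above, the four 6-bit indices now hold:
//   ind0 =  b0 >> 2
//   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4)
//   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
//   ind3 =   b2 & 0x3f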
4878
4879 __ tbl(out0, arrangement, codec, 4, ind0);
4880 __ tbl(out1, arrangement, codec, 4, ind1);
4881 __ tbl(out2, arrangement, codec, 4, ind2);
4882 __ tbl(out3, arrangement, codec, 4, ind3);
4883
4884 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
4885 }
4886
4887 /**
4888 * Arguments:
4889 *
4890 * Input:
4891 * c_rarg0 - src_start
4892 * c_rarg1 - src_offset
4893 * c_rarg2 - src_length
4894 * c_rarg3 - dest_start
4895 * c_rarg4 - dest_offset
4896 * c_rarg5 - isURL
4897 *
4898 */
4899 address generate_base64_encodeBlock() {
4900
4901 static const char toBase64[64] = {
4902 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4903 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4904 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4905 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4906 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
4907 };
4908
4909 static const char toBase64URL[64] = {
4910 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4911 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4912 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4913 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4914 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
4915 };
4916
4917 __ align(CodeEntryAlignment);
4918 StubCodeMark mark(this, "StubRoutines", "encodeBlock");
4919 address start = __ pc();
4920
4921 Register src = c_rarg0; // source array
4922 Register soff = c_rarg1; // source start offset
4923 Register send = c_rarg2; // source end offset
4924 Register dst = c_rarg3; // dest array
4925 Register doff = c_rarg4; // position for writing to dest array
4926 Register isURL = c_rarg5; // Base64 or URL character set
4927
4928 // c_rarg6 and c_rarg7 are free to use as temps
4929 Register codec = c_rarg6;
4930 Register length = c_rarg7;
4931
4932 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
4933
4934 __ add(src, src, soff);
4935 __ add(dst, dst, doff);
4936 __ sub(length, send, soff);
4937
4938 // load the codec base address
4939 __ lea(codec, ExternalAddress((address) toBase64));
4940 __ cbz(isURL, ProcessData);
4941 __ lea(codec, ExternalAddress((address) toBase64URL));
4942
4943 __ BIND(ProcessData);
4944
4945 // too short to form a SIMD loop; fall back to the 3-byte scalar path
4946 __ cmp(length, (u1)24);
4947 __ br(Assembler::LT, Process3B);
4948
4949 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
4950
4951 __ BIND(Process48B);
4952 __ cmp(length, (u1)48);
4953 __ br(Assembler::LT, Process24B);
4954 generate_base64_encode_simdround(src, dst, v0, 16);
4955 __ sub(length, length, 48);
4956 __ b(Process48B);
4957
4958 __ BIND(Process24B);
4959 __ cmp(length, (u1)24);
4960 __ br(Assembler::LT, SIMDExit);
4961 generate_base64_encode_simdround(src, dst, v0, 8);
4962 __ sub(length, length, 24);
4963
4964 __ BIND(SIMDExit);
4965 __ cbz(length, Exit);
4966
4967 __ BIND(Process3B);
4968 // 3 src bytes, 24 bits
4969 __ ldrb(r10, __ post(src, 1));
4970 __ ldrb(r11, __ post(src, 1));
4971 __ ldrb(r12, __ post(src, 1));
4972 __ orrw(r11, r11, r10, Assembler::LSL, 8);
4973 __ orrw(r12, r12, r11, Assembler::LSL, 8);
4974 // codec index
4975 __ ubfmw(r15, r12, 18, 23);
4976 __ ubfmw(r14, r12, 12, 17);
4977 __ ubfmw(r13, r12, 6, 11);
4978 __ andw(r12, r12, 63);
4979 // get the code based on the codec
4980 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
4981 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
4982 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
4983 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
4984 __ strb(r15, __ post(dst, 1));
4985 __ strb(r14, __ post(dst, 1));
4986 __ strb(r13, __ post(dst, 1));
4987 __ strb(r12, __ post(dst, 1));
4988 __ sub(length, length, 3);
4989 __ cbnz(length, Process3B);
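// (Added sketch, not in the original source.) The scalar tail above is
// approximately:
//
//   while (length > 0) {
//     uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
//     dst[0] = codec[(bits >> 18) & 0x3f];
//     dst[1] = codec[(bits >> 12) & 0x3f];
//     dst[2] = codec[(bits >>  6) & 0x3f];
//     dst[3] = codec[ bits        & 0x3f];
//     src += 3; dst += 4; length -= 3;
//   }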
4990
4991 __ BIND(Exit);
4992 __ ret(lr);
4993
4994 return start;
4995 }
4996
4997 // Continuation point for throwing of implicit exceptions that are
4998 // not handled in the current activation. Fabricates an exception
4999 // oop and initiates normal exception dispatching in this
5000 // frame. Since we need to preserve callee-saved values (currently
5001 // only for C2, but done for C1 as well) we need a callee-saved oop
5002 // map and therefore have to make these stubs into RuntimeStubs
5003 // rather than BufferBlobs. If the compiler needs all registers to
5004 // be preserved between the fault point and the exception handler
5005 // then it must assume responsibility for that in
5006 // AbstractCompiler::continuation_for_implicit_null_exception or
5007 // continuation_for_implicit_division_by_zero_exception. All other
5008 // implicit exceptions (e.g., NullPointerException or
5009 // AbstractMethodError on entry) are either at call sites or
5010 // otherwise assume that stack unwinding will be initiated, so
5011 // caller saved registers were assumed volatile in the compiler.
5012
5013 #undef __
5014 #define __ masm->
5015
5016 address generate_throw_exception(const char* name,
5017 address runtime_entry,
5018 Register arg1 = noreg,
5019 Register arg2 = noreg) {
5020 // Information about frame layout at time of blocking runtime call.
5021 // Note that we only have to preserve callee-saved registers since
5022 // the compilers are responsible for supplying a continuation point
5023 // if they expect all registers to be preserved.
5024 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5025 enum layout {
5026 rfp_off = 0,
5027 rfp_off2,
5028 return_off,
5029 return_off2,
5030 framesize // inclusive of return address
5031 };
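// (Added note, not in the original source.) Offsets in this enum are 32-bit
// VMReg slots, so each saved 64-bit register takes an *_off/*_off2 pair;
// the frame holds only the saved rfp and the return address, i.e.
// framesize == 4 slots == 16 bytes.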
5032
5033 int insts_size = 512;
5034 int locs_size = 64;
5035
5036 CodeBuffer code(name, insts_size, locs_size);
5037 OopMapSet* oop_maps = new OopMapSet();
5038 MacroAssembler* masm = new MacroAssembler(&code);
5039
5040 address start = __ pc();
5041
5042 // This is an inlined and slightly modified version of call_VM
5043 // which has the ability to fetch the return PC out of
5044 // thread-local storage and also sets up last_Java_sp slightly
5045 // differently than the real call_VM
5046
5047 __ enter(); // Save FP and LR before call
5048
5049 assert(is_even(framesize/2), "sp not 16-byte aligned");
5050
5051 // lr and fp are already in place
5052 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
5053
5054 int frame_complete = __ pc() - start;
5055
5056 // Set up last_Java_sp and last_Java_fp
5057 address the_pc = __ pc();
5058 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
5059
5060 // Call runtime
5061 if (arg1 != noreg) {
5062 assert(arg2 != c_rarg1, "clobbered");
5063 __ mov(c_rarg1, arg1);
5064 }
5065 if (arg2 != noreg) {
5066 __ mov(c_rarg2, arg2);
5067 }
5068 __ mov(c_rarg0, rthread);
5069 BLOCK_COMMENT("call runtime_entry");
5070 __ mov(rscratch1, runtime_entry);
5071 __ blr(rscratch1);
5072
5073 // Generate oop map
5074 OopMap* map = new OopMap(framesize, 0);
5075
5076 oop_maps->add_gc_map(the_pc - start, map);
5077
5078 __ reset_last_Java_frame(true);
5079 __ maybe_isb();
5080
5081 __ leave();
5082
5083 // check for pending exceptions
5084 #ifdef ASSERT
5085 Label L;
5086 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5087 __ cbnz(rscratch1, L);
5088 __ should_not_reach_here();
5089 __ bind(L);
5090 #endif // ASSERT
5091 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5092
5093
5094 // codeBlob framesize is in words (not VMRegImpl::slot_size)
5095 RuntimeStub* stub =
5096 RuntimeStub::new_runtime_stub(name,
5097 &code,
5098 frame_complete,
5099 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5100 oop_maps, false);
5101 return stub->entry_point();
5102 }
5103
5104 class MontgomeryMultiplyGenerator : public MacroAssembler {
5105
5106 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5107 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5108
5109 RegSet _toSave;
5110 bool _squaring;
5111
5112 public:
5113 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5114 : MacroAssembler(as->code()), _squaring(squaring) {
5115
5116 // Register allocation
5117
5118 Register reg = c_rarg0;
5119 Pa_base = reg; // Argument registers
5120 if (squaring)
5121 Pb_base = Pa_base;
5122 else
5123 Pb_base = ++reg;
5124 Pn_base = ++reg;
5125 Rlen= ++reg;
5126 inv = ++reg;
5127 Pm_base = ++reg;
5128
5129 // Working registers:
5130 Ra = ++reg; // The current digit of a, b, n, and m.
5131 Rb = ++reg;
5132 Rm = ++reg;
5133 Rn = ++reg;
5134
5135 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
5136 Pb = ++reg;
5137 Pm = ++reg;
5138 Pn = ++reg;
5139
5140 t0 = ++reg; // Three registers which form a
5141 t1 = ++reg; // triple-precision accumulator.
5142 t2 = ++reg;
5143
5144 Ri = ++reg; // Inner and outer loop indexes.
5145 Rj = ++reg;
5146
5147 Rhi_ab = ++reg; // Product registers: low and high parts
5148 Rlo_ab = ++reg; // of a*b and m*n.
5149 Rhi_mn = ++reg;
5150 Rlo_mn = ++reg;
5151
5152 // r19 and up are callee-saved.
5153 _toSave = RegSet::range(r19, reg) + Pm_base;
5154 }
5155
5156 private:
5157 void save_regs() {
5158 push(_toSave, sp);
5159 }
5160
5161 void restore_regs() {
5162 pop(_toSave, sp);
5163 }
5164
5165 template <typename T>
5166 void unroll_2(Register count, T block) {
5167 Label loop, end, odd;
5168 tbnz(count, 0, odd);
5169 cbz(count, end);
5170 align(16);
5171 bind(loop);
5172 (this->*block)();
5173 bind(odd);
5174 (this->*block)();
5175 subs(count, count, 2);
5176 br(Assembler::GT, loop);
5177 bind(end);
5178 }
5179
5180 template <typename T>
5181 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5182 Label loop, end, odd;
5183 tbnz(count, 0, odd);
5184 cbz(count, end);
5185 align(16);
5186 bind(loop);
5187 (this->*block)(d, s, tmp);
5188 bind(odd);
5189 (this->*block)(d, s, tmp);
5190 subs(count, count, 2);
5191 br(Assembler::GT, loop);
5192 bind(end);
5193 }
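// (Added note, not in the original source.) Both unroll_2 overloads run
// `block` exactly `count` times with two copies per loop iteration: an odd
// count enters at the second copy (label `odd`), so each pass of the
// subs/br back-edge retires two calls.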
5194
5195 void pre1(RegisterOrConstant i) {
5196 block_comment("pre1");
5197 // Pa = Pa_base;
5198 // Pb = Pb_base + i;
5199 // Pm = Pm_base;
5200 // Pn = Pn_base + i;
5201 // Ra = *Pa;
5202 // Rb = *Pb;
5203 // Rm = *Pm;
5204 // Rn = *Pn;
5205 ldr(Ra, Address(Pa_base));
5206 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5207 ldr(Rm, Address(Pm_base));
5208 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5209 lea(Pa, Address(Pa_base));
5210 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5211 lea(Pm, Address(Pm_base));
5212 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5213
5214 // Zero the m*n result.
5215 mov(Rhi_mn, zr);
5216 mov(Rlo_mn, zr);
5217 }
5218
5219 // The core multiply-accumulate step of a Montgomery
5220 // multiplication. The idea is to schedule operations as a
5221 // pipeline so that instructions with long latencies (loads and
5222 // multiplies) have time to complete before their results are
5223 // used. This most benefits in-order implementations of the
5224 // architecture but out-of-order ones also benefit.
5225 void step() {
5226 block_comment("step");
5227 // MACC(Ra, Rb, t0, t1, t2);
5228 // Ra = *++Pa;
5229 // Rb = *--Pb;
5230 umulh(Rhi_ab, Ra, Rb);
5231 mul(Rlo_ab, Ra, Rb);
5232 ldr(Ra, pre(Pa, wordSize));
5233 ldr(Rb, pre(Pb, -wordSize));
5234 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5235 // previous iteration.
5236 // MACC(Rm, Rn, t0, t1, t2);
5237 // Rm = *++Pm;
5238 // Rn = *--Pn;
5239 umulh(Rhi_mn, Rm, Rn);
5240 mul(Rlo_mn, Rm, Rn);
5241 ldr(Rm, pre(Pm, wordSize));
5242 ldr(Rn, pre(Pn, -wordSize));
5243 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5244 }
5245
5246 void post1() {
5247 block_comment("post1");
5248
5249 // MACC(Ra, Rb, t0, t1, t2);
5250 // Ra = *++Pa;
5251 // Rb = *--Pb;
5252 umulh(Rhi_ab, Ra, Rb);
5253 mul(Rlo_ab, Ra, Rb);
5254 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5255 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5256
5257 // *Pm = Rm = t0 * inv;
5258 mul(Rm, t0, inv);
5259 str(Rm, Address(Pm));
5260
5261 // MACC(Rm, Rn, t0, t1, t2);
5262 // t0 = t1; t1 = t2; t2 = 0;
5263 umulh(Rhi_mn, Rm, Rn);
5264
5265 #ifndef PRODUCT
5266 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5267 {
5268 mul(Rlo_mn, Rm, Rn);
5269 add(Rlo_mn, t0, Rlo_mn);
5270 Label ok;
5271 cbz(Rlo_mn, ok); {
5272 stop("broken Montgomery multiply");
5273 } bind(ok);
5274 }
5275 #endif
5276 // We have very carefully set things up so that
5277 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5278 // the lower half of Rm * Rn because we know the result already:
5279 // it must be -t0. t0 + (-t0) must generate a carry iff
5280 // t0 != 0. So, rather than do a mul and an adds we just set
5281 // the carry flag iff t0 is nonzero.
5282 //
5283 // mul(Rlo_mn, Rm, Rn);
5284 // adds(zr, t0, Rlo_mn);
5285 subs(zr, t0, 1); // Set carry iff t0 is nonzero
5286 adcs(t0, t1, Rhi_mn);
5287 adc(t1, t2, zr);
5288 mov(t2, zr);
5289 }
5290
5291 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5292 block_comment("pre2");
5293 // Pa = Pa_base + i-len;
5294 // Pb = Pb_base + len;
5295 // Pm = Pm_base + i-len;
5296 // Pn = Pn_base + len;
5297
5298 if (i.is_register()) {
5299 sub(Rj, i.as_register(), len);
5300 } else {
5301 mov(Rj, i.as_constant());
5302 sub(Rj, Rj, len);
5303 }
5304 // Rj == i-len
5305
5306 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5307 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5308 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5309 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5310
5311 // Ra = *++Pa;
5312 // Rb = *--Pb;
5313 // Rm = *++Pm;
5314 // Rn = *--Pn;
5315 ldr(Ra, pre(Pa, wordSize));
5316 ldr(Rb, pre(Pb, -wordSize));
5317 ldr(Rm, pre(Pm, wordSize));
5318 ldr(Rn, pre(Pn, -wordSize));
5319
5320 mov(Rhi_mn, zr);
5321 mov(Rlo_mn, zr);
5322 }
5323
5324 void post2(RegisterOrConstant i, RegisterOrConstant len) {
5325 block_comment("post2");
5326 if (i.is_constant()) {
5327 mov(Rj, i.as_constant()-len.as_constant());
5328 } else {
5329 sub(Rj, i.as_register(), len);
5330 }
5331
5332 adds(t0, t0, Rlo_mn); // The pending m*n, low part
5333
5334 // As soon as we know the least significant digit of our result,
5335 // store it.
5336 // Pm_base[i-len] = t0;
5337 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5338
5339 // t0 = t1; t1 = t2; t2 = 0;
5340 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5341 adc(t1, t2, zr);
5342 mov(t2, zr);
5343 }
5344
5345 // A carry in t0 after Montgomery multiplication means that we
5346 // should subtract multiples of n from our result in m. We'll
5347 // keep doing that until there is no carry.
5348 void normalize(RegisterOrConstant len) {
5349 block_comment("normalize");
5350 // while (t0)
5351 // t0 = sub(Pm_base, Pn_base, t0, len);
5352 Label loop, post, again;
5353 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5354 cbz(t0, post); {
5355 bind(again); {
5356 mov(i, zr);
5357 mov(cnt, len);
5358 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5359 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5360 subs(zr, zr, zr); // set carry flag, i.e. no borrow
5361 align(16);
5362 bind(loop); {
5363 sbcs(Rm, Rm, Rn);
5364 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5365 add(i, i, 1);
5366 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5367 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5368 sub(cnt, cnt, 1);
5369 } cbnz(cnt, loop);
5370 sbc(t0, t0, zr);
5371 } cbnz(t0, again);
5372 } bind(post);
5373 }
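// (Added sketch, not in the original source.) The sub() referenced in the
// comment above is a multi-precision subtract with borrow, approximately:
//
//   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
//       Pm[i] = (julong)d;
//       borrow = (julong)(d >> 64) & 1;   // 1 iff this step borrowed
//     }
//     return t0 - borrow;
//   }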
5374
5375 // Move memory at s to d, reversing words.
5376 // Increments d to end of copied memory
5377 // Destroys tmp1, tmp2
5378 // Preserves len
5379 // Leaves s pointing to the address which was in d at start
5380 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5381 assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5382
5383 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5384 mov(tmp1, len);
5385 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5386 sub(s, d, len, ext::uxtw, LogBytesPerWord);
5387 }
5388 // where
5389 void reverse1(Register d, Register s, Register tmp) {
5390 ldr(tmp, pre(s, -wordSize));
5391 ror(tmp, tmp, 32);
5392 str(tmp, post(d, wordSize));
5393 }
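// (Added note, not in the original source.) reverse1 copies one 64-bit
// doubleword, and the ror by 32 swaps its two 32-bit halves; since
// reverse() walks s backwards while d advances forwards, the net effect is
// to reverse the order of the 32-bit words between s and d.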
5394
5395 void step_squaring() {
5396 // An extra ACC
5397 step();
5398 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5399 }
5400
5401 void last_squaring(RegisterOrConstant i) {
5402 Label dont;
5403 // if ((i & 1) == 0) {
5404 tbnz(i.as_register(), 0, dont); {
5405 // MACC(Ra, Rb, t0, t1, t2);
5406 // Ra = *++Pa;
5407 // Rb = *--Pb;
5408 umulh(Rhi_ab, Ra, Rb);
5409 mul(Rlo_ab, Ra, Rb);
5410 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5411 } bind(dont);
5412 }
5413
5414 void extra_step_squaring() {
5415 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5416
5417 // MACC(Rm, Rn, t0, t1, t2);
5418 // Rm = *++Pm;
5419 // Rn = *--Pn;
5420 umulh(Rhi_mn, Rm, Rn);
5421 mul(Rlo_mn, Rm, Rn);
5422 ldr(Rm, pre(Pm, wordSize));
5423 ldr(Rn, pre(Pn, -wordSize));
5424 }
5425
5426 void post1_squaring() {
5427 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5428
5429 // *Pm = Rm = t0 * inv;
5430 mul(Rm, t0, inv);
5431 str(Rm, Address(Pm));
5432
5433 // MACC(Rm, Rn, t0, t1, t2);
5434 // t0 = t1; t1 = t2; t2 = 0;
5435 umulh(Rhi_mn, Rm, Rn);
5436
5437 #ifndef PRODUCT
5438 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5439 {
5440 mul(Rlo_mn, Rm, Rn);
5441 add(Rlo_mn, t0, Rlo_mn);
5442 Label ok;
5443 cbz(Rlo_mn, ok); {
5444 stop("broken Montgomery multiply");
5445 } bind(ok);
5446 }
5447 #endif
5448 // We have very carefully set things up so that
5449 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5450 // the lower half of Rm * Rn because we know the result already:
5451 // it must be -t0. t0 + (-t0) must generate a carry iff
5452 // t0 != 0. So, rather than do a mul and an adds we just set
5453 // the carry flag iff t0 is nonzero.
5454 //
5455 // mul(Rlo_mn, Rm, Rn);
5456 // adds(zr, t0, Rlo_mn);
5457 subs(zr, t0, 1); // Set carry iff t0 is nonzero
5458 adcs(t0, t1, Rhi_mn);
5459 adc(t1, t2, zr);
5460 mov(t2, zr);
5461 }
5462
5463 void acc(Register Rhi, Register Rlo,
5464 Register t0, Register t1, Register t2) {
5465 adds(t0, t0, Rlo);
5466 adcs(t1, t1, Rhi);
5467 adc(t2, t2, zr);
5468 }
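// (Added sketch, not in the original source.) acc() adds the 128-bit value
// Rhi:Rlo into the triple-precision accumulator t2:t1:t0. The
// MACC(a, b, t0, t1, t2) step used in the pseudocode comments throughout
// this class is then approximately:
//
//   unsigned __int128 p = (unsigned __int128)a * b;
//   acc((julong)(p >> 64), (julong)p, t0, t1, t2);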
5469
5470 public:
5471 /**
5472 * Fast Montgomery multiplication. The derivation of the
5473 * algorithm is in A Cryptographic Library for the Motorola
5474 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5475 *
5476 * Arguments:
5477 *
5478 * Inputs for multiplication:
5479 * c_rarg0 - int array elements a
5480 * c_rarg1 - int array elements b
5481 * c_rarg2 - int array elements n (the modulus)
5482 * c_rarg3 - int length
5483 * c_rarg4 - int inv
5484 * c_rarg5 - int array elements m (the result)
5485 *
5486 * Inputs for squaring:
5487 * c_rarg0 - int array elements a
5488 * c_rarg1 - int array elements n (the modulus)
5489 * c_rarg2 - int length
5490 * c_rarg3 - int inv
5491 * c_rarg4 - int array elements m (the result)
5492 *
5493 */
5494 address generate_multiply() {
5495 Label argh, nothing;
5496 bind(argh);
5497 stop("MontgomeryMultiply total_allocation must be <= 8192");
5498
5499 align(CodeEntryAlignment);
5500 address entry = pc();
5501
5502 cbzw(Rlen, nothing);
5503
5504 enter();
5505
5506 // Make room.
5507 cmpw(Rlen, 512);
5508 br(Assembler::HI, argh);
5509 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5510 andr(sp, Ra, -2 * wordSize);
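// (Added note, not in the original source.) Rlen still counts jints here,
// so this reserves Rlen * 16 bytes of stack: room for the reversed input
// copies and the result m (up to four arrays of Rlen jints), which is why
// Rlen is capped at 512 above (512 * 16 == 8192, matching the stop() message).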
5511
5512 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
5513
5514 {
5515 // Copy input args, reversing as we go. We use Ra as a
5516 // temporary variable.
5517 reverse(Ra, Pa_base, Rlen, t0, t1);
5518 if (!_squaring)
5519 reverse(Ra, Pb_base, Rlen, t0, t1);
5520 reverse(Ra, Pn_base, Rlen, t0, t1);
5521 }
5522
5523 // Push all call-saved registers and also Pm_base which we'll need
5524 // at the end.
5525 save_regs();
5526
5527 #ifndef PRODUCT
5528 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5529 {
5530 ldr(Rn, Address(Pn_base, 0));
5531 mul(Rlo_mn, Rn, inv);
5532 cmp(Rlo_mn, -1);
5533 Label ok;
5534 br(EQ, ok); {
5535 stop("broken inverse in Montgomery multiply");
5536 } bind(ok);
5537 }
5538 #endif
5539
5540 mov(Pm_base, Ra);
5541
5542 mov(t0, zr);
5543 mov(t1, zr);
5544 mov(t2, zr);
5545
5546 block_comment("for (int i = 0; i < len; i++) {");
5547 mov(Ri, zr); {
5548 Label loop, end;
5549 cmpw(Ri, Rlen);
5550 br(Assembler::GE, end);
5551
5552 bind(loop);
5553 pre1(Ri);
5554
5555 block_comment(" for (j = i; j; j--) {"); {
5556 movw(Rj, Ri);
5557 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5558 } block_comment(" } // j");
5559
5560 post1();
5561 addw(Ri, Ri, 1);
5562 cmpw(Ri, Rlen);
5563 br(Assembler::LT, loop);
5564 bind(end);
5565 block_comment("} // i");
5566 }
5567
5568 block_comment("for (int i = len; i < 2*len; i++) {");
5569 mov(Ri, Rlen); {
5570 Label loop, end;
5571 cmpw(Ri, Rlen, Assembler::LSL, 1);
5572 br(Assembler::GE, end);
5573
5574 bind(loop);
5575 pre2(Ri, Rlen);
5576
5577 block_comment(" for (j = len*2-i-1; j; j--) {"); {
5578 lslw(Rj, Rlen, 1);
5579 subw(Rj, Rj, Ri);
5580 subw(Rj, Rj, 1);
5581 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5582 } block_comment(" } // j");
5583
5584 post2(Ri, Rlen);
5585 addw(Ri, Ri, 1);
5586 cmpw(Ri, Rlen, Assembler::LSL, 1);
5587 br(Assembler::LT, loop);
5588 bind(end);
5589 }
5590 block_comment("} // i");
5591
5592 normalize(Rlen);
5593
5594 mov(Ra, Pm_base); // Save Pm_base in Ra
5595 restore_regs(); // Restore caller's Pm_base
5596
5597 // Copy our result into caller's Pm_base
5598 reverse(Pm_base, Ra, Rlen, t0, t1);
5599
5600 leave();
5601 bind(nothing);
5602 ret(lr);
5603
5604 return entry;
5605 }
5606 // In C, approximately:
5607
5608 // void
5609 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
5610 // julong Pn_base[], julong Pm_base[],
5611 // julong inv, int len) {
5612 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5613 // julong *Pa, *Pb, *Pn, *Pm;
5614 // julong Ra, Rb, Rn, Rm;
5615
5616 // int i;
5617
5618 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5619
5620 // for (i = 0; i < len; i++) {
5621 // int j;
5622
5623 // Pa = Pa_base;
5624 // Pb = Pb_base + i;
5625 // Pm = Pm_base;
5626 // Pn = Pn_base + i;
5627
5628 // Ra = *Pa;
5629 // Rb = *Pb;
5630 // Rm = *Pm;
5631 // Rn = *Pn;
5632
5633 // int iters = i;
5634 // for (j = 0; iters--; j++) {
5635 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5636 // MACC(Ra, Rb, t0, t1, t2);
5637 // Ra = *++Pa;
5638 // Rb = *--Pb;
5639 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5640 // MACC(Rm, Rn, t0, t1, t2);
5641 // Rm = *++Pm;
5642 // Rn = *--Pn;
5643 // }
5644
5645 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5646 // MACC(Ra, Rb, t0, t1, t2);
5647 // *Pm = Rm = t0 * inv;
5648 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5649 // MACC(Rm, Rn, t0, t1, t2);
5650
5651 // assert(t0 == 0, "broken Montgomery multiply");
5652
5653 // t0 = t1; t1 = t2; t2 = 0;
5654 // }
5655
5656 // for (i = len; i < 2*len; i++) {
5657 // int j;
5658
5659 // Pa = Pa_base + i-len;
5660 // Pb = Pb_base + len;
5661 // Pm = Pm_base + i-len;
5662 // Pn = Pn_base + len;
5663
5664 // Ra = *++Pa;
5665 // Rb = *--Pb;
5666 // Rm = *++Pm;
5667 // Rn = *--Pn;
5668
5669 // int iters = len*2-i-1;
5670 // for (j = i-len+1; iters--; j++) {
5671 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5672 // MACC(Ra, Rb, t0, t1, t2);
5673 // Ra = *++Pa;
5674 // Rb = *--Pb;
5675 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5676 // MACC(Rm, Rn, t0, t1, t2);
5677 // Rm = *++Pm;
5678 // Rn = *--Pn;
5679 // }
5680
5681 // Pm_base[i-len] = t0;
5682 // t0 = t1; t1 = t2; t2 = 0;
5683 // }
5684
5685 // while (t0)
5686 // t0 = sub(Pm_base, Pn_base, t0, len);
5687 // }
5688
5689 /**
5690 * Fast Montgomery squaring. This uses asymptotically 25% fewer
5691 * multiplies than Montgomery multiplication so it should be up to
5692 * 25% faster. However, its loop control is more complex and it
5693 * may actually run slower on some machines.
5694 *
5695 * Arguments:
5696 *
5697 * Inputs:
5698 * c_rarg0 - int array elements a
5699 * c_rarg1 - int array elements n (the modulus)
5700 * c_rarg2 - int length
5701 * c_rarg3 - int inv
5702 * c_rarg4 - int array elements m (the result)
5703 *
5704 */
5705 address generate_square() {
5706 Label argh;
5707 bind(argh);
5708 stop("MontgomeryMultiply total_allocation must be <= 8192");
5709
5710 align(CodeEntryAlignment);
5711 address entry = pc();
5712
5713 enter();
5714
5715 // Make room.
5716 cmpw(Rlen, 512);
5717 br(Assembler::HI, argh);
5718 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5719 andr(sp, Ra, -2 * wordSize);
5720
5721 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
5722
5723 {
5724 // Copy input args, reversing as we go. We use Ra as a
5725 // temporary variable.
5726 reverse(Ra, Pa_base, Rlen, t0, t1);
5727 reverse(Ra, Pn_base, Rlen, t0, t1);
5728 }
5729
5730 // Push all call-saved registers and also Pm_base which we'll need
5731 // at the end.
5732 save_regs();
5733
5734 mov(Pm_base, Ra);
5735
5736 mov(t0, zr);
5737 mov(t1, zr);
5738 mov(t2, zr);
5739
5740 block_comment("for (int i = 0; i < len; i++) {");
5741 mov(Ri, zr); {
5742 Label loop, end;
5743 bind(loop);
5744 cmp(Ri, Rlen);
5745 br(Assembler::GE, end);
5746
5747 pre1(Ri);
5748
5749 block_comment("for (j = (i+1)/2; j; j--) {"); {
5750 add(Rj, Ri, 1);
5751 lsr(Rj, Rj, 1);
5752 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5753 } block_comment(" } // j");
5754
5755 last_squaring(Ri);
5756
5757 block_comment(" for (j = i/2; j; j--) {"); {
5758 lsr(Rj, Ri, 1);
5759 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5760 } block_comment(" } // j");
5761
5762 post1_squaring();
5763 add(Ri, Ri, 1);
5764 cmp(Ri, Rlen);
5765 br(Assembler::LT, loop);
5766
5767 bind(end);
5768 block_comment("} // i");
5769 }
5770
5771 block_comment("for (int i = len; i < 2*len; i++) {");
5772 mov(Ri, Rlen); {
5773 Label loop, end;
5774 bind(loop);
5775 cmp(Ri, Rlen, Assembler::LSL, 1);
5776 br(Assembler::GE, end);
5777
5778 pre2(Ri, Rlen);
5779
5780 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
5781 lsl(Rj, Rlen, 1);
5782 sub(Rj, Rj, Ri);
5783 sub(Rj, Rj, 1);
5784 lsr(Rj, Rj, 1);
5785 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5786 } block_comment(" } // j");
5787
5788 last_squaring(Ri);
5789
5790 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
5791 lsl(Rj, Rlen, 1);
5792 sub(Rj, Rj, Ri);
5793 lsr(Rj, Rj, 1);
5794 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5795 } block_comment(" } // j");
5796
5797 post2(Ri, Rlen);
5798 add(Ri, Ri, 1);
5799 cmp(Ri, Rlen, Assembler::LSL, 1);
5800
5801 br(Assembler::LT, loop);
5802 bind(end);
5803 block_comment("} // i");
5804 }
5805
5806 normalize(Rlen);
5807
5808 mov(Ra, Pm_base); // Save Pm_base in Ra
5809 restore_regs(); // Restore caller's Pm_base
5810
5811 // Copy our result into caller's Pm_base
5812 reverse(Pm_base, Ra, Rlen, t0, t1);
5813
5814 leave();
5815 ret(lr);
5816
5817 return entry;
5818 }
5819 // In C, approximately:
5820
5821 // void
5822 // montgomery_square(julong Pa_base[], julong Pn_base[],
5823 //                   julong Pm_base[],
5824 //                   julong inv, int len) {
5825 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5826 // julong *Pa, *Pb, *Pn, *Pm;
5827 // julong Ra, Rb, Rn, Rm;
5828
5829 // int i;
5830
5831 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5832
5833 // for (i = 0; i < len; i++) {
5834 // int j;
5835
5836 // Pa = Pa_base;
5837 // Pb = Pa_base + i;
5838 // Pm = Pm_base;
5839 // Pn = Pn_base + i;
5840
5841 // Ra = *Pa;
5842 // Rb = *Pb;
5843 // Rm = *Pm;
5844 // Rn = *Pn;
5845
5846 // int iters = (i+1)/2;
5847 // for (j = 0; iters--; j++) {
5848 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5849 // MACC2(Ra, Rb, t0, t1, t2);
5850 // Ra = *++Pa;
5851 // Rb = *--Pb;
5852 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5853 // MACC(Rm, Rn, t0, t1, t2);
5854 // Rm = *++Pm;
5855 // Rn = *--Pn;
5856 // }
5857 // if ((i & 1) == 0) {
5858 // assert(Ra == Pa_base[j], "must be");
5859 // MACC(Ra, Ra, t0, t1, t2);
5860 // }
5861 // iters = i/2;
5862 // assert(iters == i-j, "must be");
5863 // for (; iters--; j++) {
5864 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5865 // MACC(Rm, Rn, t0, t1, t2);
5866 // Rm = *++Pm;
5867 // Rn = *--Pn;
5868 // }
5869
5870 // *Pm = Rm = t0 * inv;
5871 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5872 // MACC(Rm, Rn, t0, t1, t2);
5873
5874 // assert(t0 == 0, "broken Montgomery multiply");
5875
5876 // t0 = t1; t1 = t2; t2 = 0;
5877 // }
5878
5879 // for (i = len; i < 2*len; i++) {
5880 // int start = i-len+1;
5881 // int end = start + (len - start)/2;
5882 // int j;
5883
5884 // Pa = Pa_base + i-len;
5885 // Pb = Pa_base + len;
5886 // Pm = Pm_base + i-len;
5887 // Pn = Pn_base + len;
5888
5889 // Ra = *++Pa;
5890 // Rb = *--Pb;
5891 // Rm = *++Pm;
5892 // Rn = *--Pn;
5893
5894 // int iters = (2*len-i-1)/2;
5895 // assert(iters == end-start, "must be");
5896 // for (j = start; iters--; j++) {
5897 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5898 // MACC2(Ra, Rb, t0, t1, t2);
5899 // Ra = *++Pa;
5900 // Rb = *--Pb;
5901 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5902 // MACC(Rm, Rn, t0, t1, t2);
5903 // Rm = *++Pm;
5904 // Rn = *--Pn;
5905 // }
5906 // if ((i & 1) == 0) {
5907 // assert(Ra == Pa_base[j], "must be");
5908 // MACC(Ra, Ra, t0, t1, t2);
5909 // }
5910 // iters = (2*len-i)/2;
5911 // assert(iters == len-j, "must be");
5912 // for (; iters--; j++) {
5913 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5914 // MACC(Rm, Rn, t0, t1, t2);
5915 // Rm = *++Pm;
5916 // Rn = *--Pn;
5917 // }
5918 // Pm_base[i-len] = t0;
5919 // t0 = t1; t1 = t2; t2 = 0;
5920 // }
5921
5922 // while (t0)
5923 // t0 = sub(Pm_base, Pn_base, t0, len);
5924 // }
5925 };
5926
5927
5928 // Initialization
5929 void generate_initial() {
5930 // Generates initial stubs and initializes the entry points
5931
5932 // entry points that exist in all platforms. Note: This is code
5933 // that could be shared among different platforms - however the
5934 // benefit seems to be smaller than the disadvantage of having a
5935 // much more complicated generator structure. See also comment in
5936 // stubRoutines.hpp.
5937
5938 StubRoutines::_forward_exception_entry = generate_forward_exception();
5939
5940 StubRoutines::_call_stub_entry =
5941 generate_call_stub(StubRoutines::_call_stub_return_address);
5942
5943 // is referenced by megamorphic call
5944 StubRoutines::_catch_exception_entry = generate_catch_exception();
5945
5946 // Build this early so it's available for the interpreter.
5947 StubRoutines::_throw_StackOverflowError_entry =
5948 generate_throw_exception("StackOverflowError throw_exception",
5949 CAST_FROM_FN_PTR(address,
5950 SharedRuntime::throw_StackOverflowError));
5951 StubRoutines::_throw_delayed_StackOverflowError_entry =
5952 generate_throw_exception("delayed StackOverflowError throw_exception",
5953 CAST_FROM_FN_PTR(address,
5954 SharedRuntime::throw_delayed_StackOverflowError));
5955 if (UseCRC32Intrinsics) {
5956 // set table address before stub generation which uses it
5957 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5958 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5959 }
5960
5961 if (UseCRC32CIntrinsics) {
5962 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5963 }
5964
5965 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5966 // disabled pending fix and retest of generated code via JDK-8210858
5967 // StubRoutines::_dlog = generate_dlog();
5968 }
5969
5970 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5971 // disabled pending fix and retest of generated code via JDK-8210461
5972 // StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5973 }
5974
5975 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5976 // disabled pending fix and retest of generated code via JDK-8210461
5977 // StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5978 }
5979 }
5980
5981 void generate_all() {
5982 // support for verify_oop (must happen after universe_init)
5983 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5984 StubRoutines::_throw_AbstractMethodError_entry =
5985 generate_throw_exception("AbstractMethodError throw_exception",
5986 CAST_FROM_FN_PTR(address,
5987 SharedRuntime::
5988 throw_AbstractMethodError));
5989
5990 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5991 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5992 CAST_FROM_FN_PTR(address,
5993 SharedRuntime::
5994 throw_IncompatibleClassChangeError));
5995
5996 StubRoutines::_throw_NullPointerException_at_call_entry =
5997 generate_throw_exception("NullPointerException at call throw_exception",
5998 CAST_FROM_FN_PTR(address,
5999 SharedRuntime::
6000 throw_NullPointerException_at_call));
6001
6002 // arraycopy stubs used by compilers
6003 generate_arraycopy_stubs();
6004
6005 // has negatives stub for large arrays.
6006 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
6007
6008 // array equals stub for large arrays.
6009 if (!UseSimpleArrayEquals) {
6010 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
6011 }
6012
6013 generate_compare_long_strings();
6014
6015 generate_string_indexof_stubs();
6016
6017 // byte_array_inflate stub for large arrays.
6018 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
6019
6020 #ifdef COMPILER2
6021 if (UseMultiplyToLenIntrinsic) {
6022 StubRoutines::_multiplyToLen = generate_multiplyToLen();
6023 }
6024
6025 if (UseSquareToLenIntrinsic) {
6026 StubRoutines::_squareToLen = generate_squareToLen();
6027 }
6028
6029 if (UseMulAddIntrinsic) {
6030 StubRoutines::_mulAdd = generate_mulAdd();
6031 }
6032
6033 if (UseMontgomeryMultiplyIntrinsic) {
6034 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6035 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6036 StubRoutines::_montgomeryMultiply = g.generate_multiply();
6037 }
6038
6039 if (UseMontgomerySquareIntrinsic) {
6040 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6041 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6042 // We use generate_multiply() rather than generate_square()
6043 // because it's faster for the sizes of modulus we care about.
6044 StubRoutines::_montgomerySquare = g.generate_multiply();
6045 }
6046 #endif // COMPILER2
6047
6048 // generate GHASH intrinsics code
6049 if (UseGHASHIntrinsics) {
6050 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6051 }
6052
6053 if (UseBASE64Intrinsics) {
6054 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6055 }
6056
6057 if (UseAESIntrinsics) {
6058 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6059 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6060 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6061 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
6062 }
6063
6064 if (UseSHA1Intrinsics) {
6065 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
6066 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
6067 }
6068 if (UseSHA256Intrinsics) {
6069 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
6070 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
6071 }
6072
6073 // generate Adler32 intrinsics code
6074 if (UseAdler32Intrinsics) {
6075 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6076 }
6077
6078 // Safefetch stubs.
6079 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
6080 &StubRoutines::_safefetch32_fault_pc,
6081 &StubRoutines::_safefetch32_continuation_pc);
6082 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6083 &StubRoutines::_safefetchN_fault_pc,
6084 &StubRoutines::_safefetchN_continuation_pc);
6085 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
6086
6087 generate_atomic_entry_points();
6088
6089 #endif // LINUX || _ALLBSD_SOURCE
6090
6091 StubRoutines::aarch64::set_completed();
6092 }
6093
6094 public:
6095 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6096 if (all) {
6097 generate_all();
6098 } else {
6099 generate_initial();
6100 }
6101 }
6102 }; // end class declaration
6103
6104 void StubGenerator_generate(CodeBuffer* code, bool all) {
6105 StubGenerator g(code, all);
6106 }
6107
6108
6109 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
6110
6111 // Define pointers to atomic stubs and initialize them to point to the
6112 // code in atomic_aarch64.S.
6113
6114 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
6115 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
6116 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
6117 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
6118 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
6119
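// (Added note, not in the original source.) For example,
// DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands approximately to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;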
6120 DEFAULT_ATOMIC_OP(fetch_add, 4, )
6121 DEFAULT_ATOMIC_OP(fetch_add, 8, )
6122 DEFAULT_ATOMIC_OP(xchg, 4, )
6123 DEFAULT_ATOMIC_OP(xchg, 8, )
6124 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
6125 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
6126 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
6127 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
6128 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
6129 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
6130
6131 #undef DEFAULT_ATOMIC_OP
6132
6133 #endif // LINUX || _ALLBSD_SOURCE
6134