/*
 * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
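// TIMES_OOP scales an array index by the in-heap oop size: 4 bytes when
// compressed oops are enabled, 8 bytes otherwise.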
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

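// BIND places the label and, in non-product builds, echoes its name into the
// code stream as a block comment to make disassembly easier to read.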
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
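  //
  // On the VM side the call looks roughly like this (a sketch only; the
  // real caller is JavaCalls::call_helper and the wrapper plumbing is
  // elided here):
  //
  //   StubRoutines::call_stub()((address)&link, result_val_address,
  //                             result_type, method(), entry_point,
  //                             parameters, size_of_parameters(), thread);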
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
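  // Offsets are in words relative to fp.  Only the lower slot of each
  // stp/ldp pair is named: storing the pair (r20, r19) at r20_save puts
  // r20 at fp - 10 words and r19 at fp - 9 words, matching the diagram.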

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
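    // (the AAPCS64 ABI requires sp to stay 16-byte aligned, hence the
    // round down to a 2-word boundary)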

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

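    // copy the parameters one word at a time from the array at c_rarg5
    // onto the stack, counting c_rarg6 down to zero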
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
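    // in effect this tests (oop & verify_oop_mask()) == verify_oop_bits();
    // the eor leaves zero exactly when the masked bits match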

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate indices for iota vector.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
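    // emit a 16-byte constant holding the lane indices 0..15 in ascending
    // (little-endian) byte order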
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
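  //
  // MacroAssembler::zero_words() branches here for the bulk of the work
  // and then clears the returned tail itself.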

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
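  //
  // e.g. for count == 13 the main loop and the 4-word tail copy
  // 8 + 4 = 12 words, and bit 0 of the final count (set) tells the
  // caller that one trailing word is still left to copy.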
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

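    // the copy loop is software-pipelined: each iteration stores the
    // eight words loaded on the previous iteration while loading the
    // next eight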
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the
        // following offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
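  // e.g. a byte copy (granularity 1) with count == 13 (0b1101) moves an
  // 8-byte word (bit 3), then a 4-byte int (bit 2), skips the 2-byte
  // step (bit 1 clear) and finishes with a single byte (bit 0).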

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
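  // e.g. step == -4 copies jints backwards: |step| is the element size
  // in bytes and the sign selects the direction.
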
  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction
    // doesn't matter because we always load all the data before writing
    // anything.
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

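    // each small case reads the leading bytes via s and the trailing
    // bytes via send (and stores via d/dend), so for sizes below the
    // maximum of a case the head and tail ranges simply overlap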
    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        __ ldp(t0, t1, Address(send, -16));

        __ stpq(v0, v1, Address(d, 0));
        __ stpq(v2, v3, Address(d, 32));
        __ stp(t0, t1, Address(dend, -16));
        __ b(finish);

        __ bind(copy96);
      }
      __ ldpq(v4, v5, Address(send, -32));

      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }

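  // In debug builds, stomp every call-clobbered register with the
  // recognizable value 0xdeadbeefdeadbeef so that stale uses fail fast.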
  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array(int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
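  // A typical instantiation (cf. the type-specific wrappers below):
  //
  //   generate_disjoint_copy(sizeof (jint), aligned, /*is_oop*/false, entry, name);
  //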
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
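    // n.b. the comparison is unsigned, so when d precedes s the
    // subtraction wraps and the forward-copy path is also taken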

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }
1634
1635
1636 // Arguments:
1637 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638 // ignored
1639 // name - stub name string
1640 //
1641 // Inputs:
1642 // c_rarg0 - source array address
1643 // c_rarg1 - destination array address
1644 // c_rarg2 - element count, treated as size_t, can be zero
1645 //
1646 // Side Effects:
1647 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1648 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1649 //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651 const char *name, bool dest_uninitialized = false) {
1652 const bool not_oop = false;
1653 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654 }
1655
1656 // Arguments:
1657 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658 // ignored
1659 // name - stub name string
1660 //
1661 // Inputs:
1662 // c_rarg0 - source array address
1663 // c_rarg1 - destination array address
1664 // c_rarg2 - element count, treated as size_t, can be zero
1665 //
1666   address generate_conjoint_long_copy(bool aligned,
1667 address nooverlap_target, address *entry,
1668 const char *name, bool dest_uninitialized = false) {
1669 const bool not_oop = false;
1670 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671 }
1672
1673 // Arguments:
1674 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675 // ignored
1676 // name - stub name string
1677 //
1678 // Inputs:
1679 // c_rarg0 - source array address
1680 // c_rarg1 - destination array address
1681 // c_rarg2 - element count, treated as size_t, can be zero
1682 //
1683 // Side Effects:
1684 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1685 // no-overlap entry point used by generate_conjoint_long_oop_copy().
1686 //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688 const char *name, bool dest_uninitialized) {
1689 const bool is_oop = true;
1690 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692 }
1693
1694 // Arguments:
1695 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696 // ignored
1697 // name - stub name string
1698 //
1699 // Inputs:
1700 // c_rarg0 - source array address
1701 // c_rarg1 - destination array address
1702 // c_rarg2 - element count, treated as size_t, can be zero
1703 //
1704   address generate_conjoint_oop_copy(bool aligned,
1705 address nooverlap_target, address *entry,
1706 const char *name, bool dest_uninitialized) {
1707 const bool is_oop = true;
1708 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710 name, dest_uninitialized);
1711 }
1712
1713
1714 // Helper for generating a dynamic type check.
1715 // Smashes rscratch1, rscratch2.
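  // (The fast path probes the cached superclass slot at super_check_offset;
  // on a miss the slow path scans the secondary-supers list before falling
  // through.)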
1716   void generate_type_check(Register sub_klass,
1717 Register super_check_offset,
1718 Register super_klass,
1719 Label& L_success) {
1720 assert_different_registers(sub_klass, super_check_offset, super_klass);
1721
1722 BLOCK_COMMENT("type_check:");
1723
1724 Label L_miss;
1725
1726 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1727 super_check_offset);
1728 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1729
1730 // Fall through on failure!
1731 __ BIND(L_miss);
1732 }
1733
1734 //
1735 // Generate checkcasting array copy stub
1736 //
1737 // Input:
1738 // c_rarg0 - source array address
1739 // c_rarg1 - destination array address
1740 // c_rarg2 - element count, treated as ssize_t, can be zero
1741 // c_rarg3 - size_t ckoff (super_check_offset)
1742 // c_rarg4 - oop ckval (super_klass)
1743 //
1744 // Output:
1745 // r0 == 0 - success
1746 // r0 == -1^K - failure, where K is partial transfer count
1747 //
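  // A caller can recover the partial transfer count K from a failure
  // result as K = -1 ^ r0 (equivalently ~r0). Illustrative C sketch,
  // not JDK code:
  //
  //   intptr_t r = checkcast_copy_stub(...);  // hypothetical call
  //   if (r != 0) {
  //     size_t copied = (size_t)~r;           // elements copied before the miss
  //   }
  //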
1748   address generate_checkcast_copy(const char *name, address *entry,
1749 bool dest_uninitialized = false) {
1750
1751 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752
1753 // Input registers (after setup_arg_regs)
1754 const Register from = c_rarg0; // source array address
1755 const Register to = c_rarg1; // destination array address
1756     const Register count       = c_rarg2;   // elements count
1757 const Register ckoff = c_rarg3; // super_check_offset
1758 const Register ckval = c_rarg4; // super_klass
1759
1760 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761 RegSet wb_post_saved_regs = RegSet::of(count);
1762
1763 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1764 const Register copied_oop = r22; // actual oop copied
1765     const Register count_save  = r21;       // orig elements count
1766 const Register start_to = r20; // destination array start address
1767 const Register r19_klass = r19; // oop._klass
1768
1769 //---------------------------------------------------------------
1770 // Assembler stub will be used for this call to arraycopy
1771 // if the two arrays are subtypes of Object[] but the
1772 // destination array type is not equal to or a supertype
1773 // of the source type. Each element must be separately
1774 // checked.
1775
1776 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777 copied_oop, r19_klass, count_save);
1778
1779 __ align(CodeEntryAlignment);
1780 StubCodeMark mark(this, "StubRoutines", name);
1781 address start = __ pc();
1782
1783 __ enter(); // required for proper stackwalking of RuntimeStub frame
1784
1785 #ifdef ASSERT
1786 // caller guarantees that the arrays really are different
1787 // otherwise, we would have to make conjoint checks
1788 { Label L;
1789 array_overlap_test(L, TIMES_OOP);
1790 __ stop("checkcast_copy within a single array");
1791 __ bind(L);
1792 }
1793 #endif //ASSERT
1794
1795 // Caller of this entry point must set up the argument registers.
1796 if (entry != NULL) {
1797 *entry = __ pc();
1798 BLOCK_COMMENT("Entry:");
1799 }
1800
1801 // Empty array: Nothing to do.
1802 __ cbz(count, L_done);
1803 __ push(RegSet::of(r19, r20, r21, r22), sp);
1804
1805 #ifdef ASSERT
1806 BLOCK_COMMENT("assert consistent ckoff/ckval");
1807 // The ckoff and ckval must be mutually consistent,
1808 // even though caller generates both.
1809 { Label L;
1810 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1811 __ ldrw(start_to, Address(ckval, sco_offset));
1812 __ cmpw(ckoff, start_to);
1813 __ br(Assembler::EQ, L);
1814 __ stop("super_check_offset inconsistent");
1815 __ bind(L);
1816 }
1817 #endif //ASSERT
1818
1819 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1820 bool is_oop = true;
1821 if (dest_uninitialized) {
1822 decorators |= IS_DEST_UNINITIALIZED;
1823 }
1824
1825 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1826 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1827
1828 // save the original count
1829 __ mov(count_save, count);
1830
1831 // Copy from low to high addresses
1832 __ mov(start_to, to); // Save destination array start address
1833 __ b(L_load_element);
1834
1835 // ======== begin loop ========
1836 // (Loop is rotated; its entry is L_load_element.)
1837 // Loop control:
1838 // for (; count != 0; count--) {
1839 // copied_oop = load_heap_oop(from++);
1840 // ... generate_type_check ...;
1841 // store_heap_oop(to++, copied_oop);
1842 // }
1843 __ align(OptoLoopAlignment);
1844
1845 __ BIND(L_store_element);
1846 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop
1847 __ sub(count, count, 1);
1848 __ cbz(count, L_do_card_marks);
1849
1850 // ======== loop entry is here ========
1851 __ BIND(L_load_element);
1852 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1853 __ cbz(copied_oop, L_store_element);
1854
1855 __ load_klass(r19_klass, copied_oop);// query the object klass
1856 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1857 // ======== end loop ========
1858
1859 // It was a real error; we must depend on the caller to finish the job.
1860 // Register count = remaining oops, count_orig = total oops.
1861 // Emit GC store barriers for the oops we have copied and report
1862 // their number to the caller.
1863
1864 __ subs(count, count_save, count); // K = partially copied oop count
1865 __ eon(count, count, zr); // report (-1^K) to caller
1866 __ br(Assembler::EQ, L_done_pop);
1867
1868 __ BIND(L_do_card_marks);
1869 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1870
1871 __ bind(L_done_pop);
1872 __ pop(RegSet::of(r19, r20, r21, r22), sp);
1873 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1874
1875 __ bind(L_done);
1876 __ mov(r0, count);
1877 __ leave();
1878 __ ret(lr);
1879
1880 return start;
1881 }
1882
1883 // Perform range checks on the proposed arraycopy.
1884 // Kills temp, but nothing else.
1885 // Also, clean the sign bits of src_pos and dst_pos.
1886   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1887 Register src_pos, // source position (c_rarg1)
1888                               Register dst,     // destination array oop (c_rarg2)
1889 Register dst_pos, // destination position (c_rarg3)
1890 Register length,
1891 Register temp,
1892 Label& L_failed) {
1893 BLOCK_COMMENT("arraycopy_range_checks:");
1894
1895 assert_different_registers(rscratch1, temp);
1896
1897 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1898 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1899 __ addw(temp, length, src_pos);
1900 __ cmpw(temp, rscratch1);
1901 __ br(Assembler::HI, L_failed);
1902
1903 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1904 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1905 __ addw(temp, length, dst_pos);
1906 __ cmpw(temp, rscratch1);
1907 __ br(Assembler::HI, L_failed);
1908
1909 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1910 __ movw(src_pos, src_pos);
1911 __ movw(dst_pos, dst_pos);
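    // (A 32-bit register write zero-extends into the upper 32 bits on
    // AArch64, so these movw instructions clear any stale high bits.)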
1912
1913 BLOCK_COMMENT("arraycopy_range_checks done");
1914 }
1915
1916 // These stubs get called from some dumb test routine.
1917 // I'll write them properly when they're called from
1918 // something that's actually doing something.
1919   static void fake_arraycopy_stub(address src, address dst, int count) {
1920 assert(count == 0, "huh?");
1921 }
1922
1923
1924 //
1925 // Generate 'unsafe' array copy stub
1926 // Though just as safe as the other stubs, it takes an unscaled
1927 // size_t argument instead of an element count.
1928 //
1929 // Input:
1930 // c_rarg0 - source array address
1931 // c_rarg1 - destination array address
1932 // c_rarg2 - byte count, treated as ssize_t, can be zero
1933 //
1934 // Examines the alignment of the operands and dispatches
1935 // to a long, int, short, or byte copy loop.
1936 //
1937   address generate_unsafe_copy(const char *name,
1938 address byte_copy_entry,
1939 address short_copy_entry,
1940 address int_copy_entry,
1941 address long_copy_entry) {
1942 Label L_long_aligned, L_int_aligned, L_short_aligned;
1943 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1944
1945 __ align(CodeEntryAlignment);
1946 StubCodeMark mark(this, "StubRoutines", name);
1947 address start = __ pc();
1948 __ enter(); // required for proper stackwalking of RuntimeStub frame
1949
1950 // bump this on entry, not on exit:
1951 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1952
1953 __ orr(rscratch1, s, d);
1954 __ orr(rscratch1, rscratch1, count);
1955
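    // rscratch1 now holds (s | d | count); a low bit is clear only when all
    // three values share that alignment. Illustrative C sketch of the
    // dispatch below (helper names are placeholders):
    //
    //   uintptr_t bits = s | d | count;
    //   if      ((bits & 7) == 0) long_copy();
    //   else if ((bits & 3) == 0) int_copy();
    //   else if ((bits & 1) == 0) short_copy();
    //   else                      byte_copy();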
1956 __ andr(rscratch1, rscratch1, BytesPerLong-1);
1957 __ cbz(rscratch1, L_long_aligned);
1958 __ andr(rscratch1, rscratch1, BytesPerInt-1);
1959 __ cbz(rscratch1, L_int_aligned);
1960 __ tbz(rscratch1, 0, L_short_aligned);
1961 __ b(RuntimeAddress(byte_copy_entry));
1962
1963 __ BIND(L_short_aligned);
1964 __ lsr(count, count, LogBytesPerShort); // size => short_count
1965 __ b(RuntimeAddress(short_copy_entry));
1966 __ BIND(L_int_aligned);
1967 __ lsr(count, count, LogBytesPerInt); // size => int_count
1968 __ b(RuntimeAddress(int_copy_entry));
1969 __ BIND(L_long_aligned);
1970 __ lsr(count, count, LogBytesPerLong); // size => long_count
1971 __ b(RuntimeAddress(long_copy_entry));
1972
1973 return start;
1974 }
1975
1976 //
1977 // Generate generic array copy stubs
1978 //
1979 // Input:
1980 // c_rarg0 - src oop
1981 // c_rarg1 - src_pos (32-bits)
1982 // c_rarg2 - dst oop
1983 // c_rarg3 - dst_pos (32-bits)
1984 // c_rarg4 - element count (32-bits)
1985 //
1986 // Output:
1987 // r0 == 0 - success
1988 // r0 == -1^K - failure, where K is partial transfer count
1989 //
1990   address generate_generic_copy(const char *name,
1991 address byte_copy_entry, address short_copy_entry,
1992 address int_copy_entry, address oop_copy_entry,
1993 address long_copy_entry, address checkcast_copy_entry) {
1994
1995 Label L_failed, L_objArray;
1996 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1997
1998 // Input registers
1999 const Register src = c_rarg0; // source array oop
2000 const Register src_pos = c_rarg1; // source position
2001 const Register dst = c_rarg2; // destination array oop
2002 const Register dst_pos = c_rarg3; // destination position
2003 const Register length = c_rarg4;
2004
2005
2006 // Registers used as temps
2007 const Register dst_klass = c_rarg5;
2008
2009 __ align(CodeEntryAlignment);
2010
2011 StubCodeMark mark(this, "StubRoutines", name);
2012
2013 address start = __ pc();
2014
2015 __ enter(); // required for proper stackwalking of RuntimeStub frame
2016
2017 // bump this on entry, not on exit:
2018 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2019
2020 //-----------------------------------------------------------------------
2021 // Assembler stub will be used for this call to arraycopy
2022 // if the following conditions are met:
2023 //
2024 // (1) src and dst must not be null.
2025 // (2) src_pos must not be negative.
2026 // (3) dst_pos must not be negative.
2027 // (4) length must not be negative.
2028 // (5) src klass and dst klass should be the same and not NULL.
2029 // (6) src and dst should be arrays.
2030 // (7) src_pos + length must not exceed length of src.
2031 // (8) dst_pos + length must not exceed length of dst.
2032 //
2033
2034 // if (src == NULL) return -1;
2035 __ cbz(src, L_failed);
2036
2037 // if (src_pos < 0) return -1;
2038 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2039
2040 // if (dst == NULL) return -1;
2041 __ cbz(dst, L_failed);
2042
2043 // if (dst_pos < 0) return -1;
2044 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2045
2046 // registers used as temp
2047 const Register scratch_length = r16; // elements count to copy
2048 const Register scratch_src_klass = r17; // array klass
2049 const Register lh = r15; // layout helper
2050
2051 // if (length < 0) return -1;
2052 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2053 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2054
2055 __ load_klass(scratch_src_klass, src);
2056 #ifdef ASSERT
2057 // assert(src->klass() != NULL);
2058 {
2059 BLOCK_COMMENT("assert klasses not null {");
2060 Label L1, L2;
2061 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL
2062 __ bind(L1);
2063 __ stop("broken null klass");
2064 __ bind(L2);
2065 __ load_klass(rscratch1, dst);
2066 __ cbz(rscratch1, L1); // this would be broken also
2067 BLOCK_COMMENT("} assert klasses not null done");
2068 }
2069 #endif
2070
2071 // Load layout helper (32-bits)
2072 //
2073     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2074     // 32        30    24            16    8     2                 0
2075 //
2076 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2077 //
2078
2079 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2080
2081 // Handle objArrays completely differently...
2082 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2083 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2084 __ movw(rscratch1, objArray_lh);
2085 __ eorw(rscratch2, lh, rscratch1);
2086 __ cbzw(rscratch2, L_objArray);
2087
2088 // if (src->klass() != dst->klass()) return -1;
2089 __ load_klass(rscratch2, dst);
2090 __ eor(rscratch2, rscratch2, scratch_src_klass);
2091 __ cbnz(rscratch2, L_failed);
2092
2093 // if (!src->is_Array()) return -1;
2094 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2095
2096 // At this point, it is known to be a typeArray (array_tag 0x3).
2097 #ifdef ASSERT
2098 {
2099 BLOCK_COMMENT("assert primitive array {");
2100 Label L;
2101 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2102 __ cmpw(lh, rscratch2);
2103 __ br(Assembler::GE, L);
2104 __ stop("must be a primitive array");
2105 __ bind(L);
2106 BLOCK_COMMENT("} assert primitive array done");
2107 }
2108 #endif
2109
2110 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2111 rscratch2, L_failed);
2112
2113 // TypeArrayKlass
2114 //
2115 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2116 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2117 //
2118
2119 const Register rscratch1_offset = rscratch1; // array offset
2120 const Register r15_elsize = lh; // element size
2121
2122 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2123 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2124 __ add(src, src, rscratch1_offset); // src array offset
2125 __ add(dst, dst, rscratch1_offset); // dst array offset
2126 BLOCK_COMMENT("choose copy loop based on element size");
2127
2128 // next registers should be set before the jump to corresponding stub
2129 const Register from = c_rarg0; // source array address
2130 const Register to = c_rarg1; // destination array address
2131 const Register count = c_rarg2; // elements count
2132
2133     // 'from', 'to', 'count' registers must be set in this order
2134     // since they are the same registers as 'src', 'src_pos', 'dst'.
2135
2136 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2137
2138 // The possible values of elsize are 0-3, i.e. exact_log2(element
2139 // size in bytes). We do a simple bitwise binary search.
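    // Illustrative sketch of this dispatch (placeholder helper names):
    //
    //   switch (elsize) {
    //     case 0: byte_copy();  break;  // bit 1 clear, bit 0 clear
    //     case 1: short_copy(); break;  // bit 1 clear, bit 0 set
    //     case 2: int_copy();   break;  // bit 1 set,   bit 0 clear
    //     case 3: long_copy();  break;  // bit 1 set,   bit 0 set
    //   }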
2140 __ BIND(L_copy_bytes);
2141 __ tbnz(r15_elsize, 1, L_copy_ints);
2142 __ tbnz(r15_elsize, 0, L_copy_shorts);
2143 __ lea(from, Address(src, src_pos));// src_addr
2144 __ lea(to, Address(dst, dst_pos));// dst_addr
2145 __ movw(count, scratch_length); // length
2146 __ b(RuntimeAddress(byte_copy_entry));
2147
2148 __ BIND(L_copy_shorts);
2149 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2150 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2151 __ movw(count, scratch_length); // length
2152 __ b(RuntimeAddress(short_copy_entry));
2153
2154 __ BIND(L_copy_ints);
2155 __ tbnz(r15_elsize, 0, L_copy_longs);
2156 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2157 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2158 __ movw(count, scratch_length); // length
2159 __ b(RuntimeAddress(int_copy_entry));
2160
2161 __ BIND(L_copy_longs);
2162 #ifdef ASSERT
2163 {
2164 BLOCK_COMMENT("assert long copy {");
2165 Label L;
2166 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2167 __ cmpw(r15_elsize, LogBytesPerLong);
2168 __ br(Assembler::EQ, L);
2169 __ stop("must be long copy, but elsize is wrong");
2170 __ bind(L);
2171 BLOCK_COMMENT("} assert long copy done");
2172 }
2173 #endif
2174 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2175 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2176 __ movw(count, scratch_length); // length
2177 __ b(RuntimeAddress(long_copy_entry));
2178
2179 // ObjArrayKlass
2180 __ BIND(L_objArray);
2181 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2182
2183 Label L_plain_copy, L_checkcast_copy;
2184 // test array classes for subtyping
2185 __ load_klass(r15, dst);
2186 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2187 __ br(Assembler::NE, L_checkcast_copy);
2188
2189 // Identically typed arrays can be copied without element-wise checks.
2190 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191 rscratch2, L_failed);
2192
2193 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197 __ movw(count, scratch_length); // length
2198 __ BIND(L_plain_copy);
2199 __ b(RuntimeAddress(oop_copy_entry));
2200
2201 __ BIND(L_checkcast_copy);
2202 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2203 {
2204 // Before looking at dst.length, make sure dst is also an objArray.
2205 __ ldrw(rscratch1, Address(r15, lh_offset));
2206 __ movw(rscratch2, objArray_lh);
2207 __ eorw(rscratch1, rscratch1, rscratch2);
2208 __ cbnzw(rscratch1, L_failed);
2209
2210 // It is safe to examine both src.length and dst.length.
2211 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212 r15, L_failed);
2213
2214 __ load_klass(dst_klass, dst); // reload
2215
2216 // Marshal the base address arguments now, freeing registers.
2217 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2218 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2220 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221 __ movw(count, length); // length (reloaded)
2222 Register sco_temp = c_rarg3; // this register is free now
2223 assert_different_registers(from, to, count, sco_temp,
2224 dst_klass, scratch_src_klass);
2225 // assert_clean_int(count, sco_temp);
2226
2227 // Generate the type check.
2228 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2229 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2230
2231 // Smashes rscratch1, rscratch2
2232 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2233
2234 // Fetch destination element klass from the ObjArrayKlass header.
2235 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2236 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2237 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2238
2239 // the checkcast_copy loop needs two extra arguments:
2240 assert(c_rarg3 == sco_temp, "#3 already in place");
2241 // Set up arguments for checkcast_copy_entry.
2242 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2243 __ b(RuntimeAddress(checkcast_copy_entry));
2244 }
2245
2246 __ BIND(L_failed);
2247 __ mov(r0, -1);
2248 __ leave(); // required for proper stackwalking of RuntimeStub frame
2249 __ ret(lr);
2250
2251 return start;
2252 }
2253
2254 //
2255 // Generate stub for array fill. If "aligned" is true, the
2256 // "to" address is assumed to be heapword aligned.
2257 //
2258 // Arguments for generated stub:
2259 // to: c_rarg0
2260 // value: c_rarg1
2261 // count: c_rarg2 treated as signed
2262 //
2263   address generate_fill(BasicType t, bool aligned, const char *name) {
2264 __ align(CodeEntryAlignment);
2265 StubCodeMark mark(this, "StubRoutines", name);
2266 address start = __ pc();
2267
2268 BLOCK_COMMENT("Entry:");
2269
2270 const Register to = c_rarg0; // source array address
2271 const Register value = c_rarg1; // value
2272 const Register count = c_rarg2; // elements count
2273
2274 const Register bz_base = r10; // base for block_zero routine
2275 const Register cnt_words = r11; // temp register
2276
2277 __ enter();
2278
2279 Label L_fill_elements, L_exit1;
2280
2281 int shift = -1;
2282 switch (t) {
2283 case T_BYTE:
2284 shift = 0;
2285 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2286 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2287 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2288 __ br(Assembler::LO, L_fill_elements);
2289 break;
2290 case T_SHORT:
2291 shift = 1;
2292 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2294 __ br(Assembler::LO, L_fill_elements);
2295 break;
2296 case T_INT:
2297 shift = 2;
2298 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2299 __ br(Assembler::LO, L_fill_elements);
2300 break;
2301 default: ShouldNotReachHere();
2302 }
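    // At this point 'value' holds the fill pattern replicated to 32 bits,
    // e.g. a T_BYTE value of 0x2a has become 0x2a2a2a2a via the bfi
    // sequences above.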
2303
2304 // Align source address at 8 bytes address boundary.
2305 Label L_skip_align1, L_skip_align2, L_skip_align4;
2306 if (!aligned) {
2307 switch (t) {
2308 case T_BYTE:
2309 // One byte misalignment happens only for byte arrays.
2310 __ tbz(to, 0, L_skip_align1);
2311 __ strb(value, Address(__ post(to, 1)));
2312 __ subw(count, count, 1);
2313 __ bind(L_skip_align1);
2314 // Fallthrough
2315 case T_SHORT:
2316 // Two bytes misalignment happens only for byte and short (char) arrays.
2317 __ tbz(to, 1, L_skip_align2);
2318 __ strh(value, Address(__ post(to, 2)));
2319 __ subw(count, count, 2 >> shift);
2320 __ bind(L_skip_align2);
2321 // Fallthrough
2322 case T_INT:
2323 // Align to 8 bytes, we know we are 4 byte aligned to start.
2324 __ tbz(to, 2, L_skip_align4);
2325 __ strw(value, Address(__ post(to, 4)));
2326 __ subw(count, count, 4 >> shift);
2327 __ bind(L_skip_align4);
2328 break;
2329 default: ShouldNotReachHere();
2330 }
2331 }
2332
2333 //
2334 // Fill large chunks
2335 //
2336 __ lsrw(cnt_words, count, 3 - shift); // number of words
2337 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2338 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
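    // cnt_words = count >> (3 - shift) is the number of whole 8-byte words
    // to fill; 'count' now holds only the sub-word remainder, and 'value'
    // has been widened to a 64-bit pattern.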
2339 if (UseBlockZeroing) {
2340 Label non_block_zeroing, rest;
2341 // If the fill value is zero we can use the fast zero_words().
2342 __ cbnz(value, non_block_zeroing);
2343 __ mov(bz_base, to);
2344 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2345 __ zero_words(bz_base, cnt_words);
2346 __ b(rest);
2347 __ bind(non_block_zeroing);
2348 __ fill_words(to, cnt_words, value);
2349 __ bind(rest);
2350 } else {
2351 __ fill_words(to, cnt_words, value);
2352 }
2353
2354 // Remaining count is less than 8 bytes. Fill it by a single store.
2355 // Note that the total length is no less than 8 bytes.
2356 if (t == T_BYTE || t == T_SHORT) {
2357 Label L_exit1;
2358 __ cbzw(count, L_exit1);
2359 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2360 __ str(value, Address(to, -8)); // overwrite some elements
2361 __ bind(L_exit1);
2362 __ leave();
2363 __ ret(lr);
2364 }
2365
2366 // Handle copies less than 8 bytes.
2367 Label L_fill_2, L_fill_4, L_exit2;
2368 __ bind(L_fill_elements);
2369 switch (t) {
2370 case T_BYTE:
2371 __ tbz(count, 0, L_fill_2);
2372 __ strb(value, Address(__ post(to, 1)));
2373 __ bind(L_fill_2);
2374 __ tbz(count, 1, L_fill_4);
2375 __ strh(value, Address(__ post(to, 2)));
2376 __ bind(L_fill_4);
2377 __ tbz(count, 2, L_exit2);
2378 __ strw(value, Address(to));
2379 break;
2380 case T_SHORT:
2381 __ tbz(count, 0, L_fill_4);
2382 __ strh(value, Address(__ post(to, 2)));
2383 __ bind(L_fill_4);
2384 __ tbz(count, 1, L_exit2);
2385 __ strw(value, Address(to));
2386 break;
2387 case T_INT:
2388 __ cbzw(count, L_exit2);
2389 __ strw(value, Address(to));
2390 break;
2391 default: ShouldNotReachHere();
2392 }
2393 __ bind(L_exit2);
2394 __ leave();
2395 __ ret(lr);
2396 return start;
2397 }
2398
2399   address generate_data_cache_writeback() {
2400 const Register line = c_rarg0; // address of line to write back
2401
2402 __ align(CodeEntryAlignment);
2403
2404 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2405
2406 address start = __ pc();
2407 __ enter();
2408 __ cache_wb(Address(line, 0));
2409 __ leave();
2410 __ ret(lr);
2411
2412 return start;
2413 }
2414
2415   address generate_data_cache_writeback_sync() {
2416 const Register is_pre = c_rarg0; // pre or post sync
2417
2418 __ align(CodeEntryAlignment);
2419
2420 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2421
2422     // pre wbsync is a no-op
2423     // post wbsync translates to a store barrier (the AArch64 analogue of an x86 sfence)
2424
2425 Label skip;
2426 address start = __ pc();
2427 __ enter();
2428 __ cbnz(is_pre, skip);
2429 __ cache_wbsync(false);
2430 __ bind(skip);
2431 __ leave();
2432 __ ret(lr);
2433
2434 return start;
2435 }
2436
2437   void generate_arraycopy_stubs() {
2438 address entry;
2439 address entry_jbyte_arraycopy;
2440 address entry_jshort_arraycopy;
2441 address entry_jint_arraycopy;
2442 address entry_oop_arraycopy;
2443 address entry_jlong_arraycopy;
2444 address entry_checkcast_arraycopy;
2445
2446 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2447 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2448
2449 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2450
2451 //*** jbyte
2452 // Always need aligned and unaligned versions
2453 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2454 "jbyte_disjoint_arraycopy");
2455 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
2456 &entry_jbyte_arraycopy,
2457 "jbyte_arraycopy");
2458 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2459 "arrayof_jbyte_disjoint_arraycopy");
2460 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
2461 "arrayof_jbyte_arraycopy");
2462
2463 //*** jshort
2464 // Always need aligned and unaligned versions
2465 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2466 "jshort_disjoint_arraycopy");
2467 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
2468 &entry_jshort_arraycopy,
2469 "jshort_arraycopy");
2470 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2471 "arrayof_jshort_disjoint_arraycopy");
2472 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
2473 "arrayof_jshort_arraycopy");
2474
2475 //*** jint
2476 // Aligned versions
2477 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2478 "arrayof_jint_disjoint_arraycopy");
2479 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2480 "arrayof_jint_arraycopy");
2481 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2482 // entry_jint_arraycopy always points to the unaligned version
2483 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
2484 "jint_disjoint_arraycopy");
2485 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
2486 &entry_jint_arraycopy,
2487 "jint_arraycopy");
2488
2489 //*** jlong
2490 // It is always aligned
2491 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2492 "arrayof_jlong_disjoint_arraycopy");
2493 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2494 "arrayof_jlong_arraycopy");
2495 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2496 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2497
2498 //*** oops
2499 {
2500 // With compressed oops we need unaligned versions; notice that
2501 // we overwrite entry_oop_arraycopy.
2502 bool aligned = !UseCompressedOops;
2503
2504 StubRoutines::_arrayof_oop_disjoint_arraycopy
2505 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2506 /*dest_uninitialized*/false);
2507 StubRoutines::_arrayof_oop_arraycopy
2508 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2509 /*dest_uninitialized*/false);
2510 // Aligned versions without pre-barriers
2511 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2512 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2513 /*dest_uninitialized*/true);
2514 StubRoutines::_arrayof_oop_arraycopy_uninit
2515 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2516 /*dest_uninitialized*/true);
2517 }
2518
2519 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2520 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2521 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2522 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2523
2524 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2525 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2526 /*dest_uninitialized*/true);
2527
2528 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
2529 entry_jbyte_arraycopy,
2530 entry_jshort_arraycopy,
2531 entry_jint_arraycopy,
2532 entry_jlong_arraycopy);
2533
2534 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
2535 entry_jbyte_arraycopy,
2536 entry_jshort_arraycopy,
2537 entry_jint_arraycopy,
2538 entry_oop_arraycopy,
2539 entry_jlong_arraycopy,
2540 entry_checkcast_arraycopy);
2541
2542 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2543 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2544 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2545 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2546 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2547 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2548 }
2549
2550   void generate_math_stubs() { Unimplemented(); }
2551
2552 // Arguments:
2553 //
2554 // Inputs:
2555 // c_rarg0 - source byte array address
2556 // c_rarg1 - destination byte array address
2557 // c_rarg2 - K (key) in little endian int array
2558 //
2559   address generate_aescrypt_encryptBlock() {
2560 __ align(CodeEntryAlignment);
2561 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2562
2563 Label L_doLast;
2564
2565 const Register from = c_rarg0; // source array address
2566 const Register to = c_rarg1; // destination array address
2567 const Register key = c_rarg2; // key array address
2568 const Register keylen = rscratch1;
2569
2570 address start = __ pc();
2571 __ enter();
2572
2573 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2574
2575 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2576
2577 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2578 __ rev32(v1, __ T16B, v1);
2579 __ rev32(v2, __ T16B, v2);
2580 __ rev32(v3, __ T16B, v3);
2581 __ rev32(v4, __ T16B, v4);
2582 __ aese(v0, v1);
2583 __ aesmc(v0, v0);
2584 __ aese(v0, v2);
2585 __ aesmc(v0, v0);
2586 __ aese(v0, v3);
2587 __ aesmc(v0, v0);
2588 __ aese(v0, v4);
2589 __ aesmc(v0, v0);
2590
2591 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2592 __ rev32(v1, __ T16B, v1);
2593 __ rev32(v2, __ T16B, v2);
2594 __ rev32(v3, __ T16B, v3);
2595 __ rev32(v4, __ T16B, v4);
2596 __ aese(v0, v1);
2597 __ aesmc(v0, v0);
2598 __ aese(v0, v2);
2599 __ aesmc(v0, v0);
2600 __ aese(v0, v3);
2601 __ aesmc(v0, v0);
2602 __ aese(v0, v4);
2603 __ aesmc(v0, v0);
2604
2605 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2606 __ rev32(v1, __ T16B, v1);
2607 __ rev32(v2, __ T16B, v2);
2608
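    // The expanded key is 44, 52 or 60 words for AES-128, -192 and -256
    // (10, 12 and 14 rounds), so these comparisons decide how many of the
    // optional round-key pairs below still apply.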
2609 __ cmpw(keylen, 44);
2610 __ br(Assembler::EQ, L_doLast);
2611
2612 __ aese(v0, v1);
2613 __ aesmc(v0, v0);
2614 __ aese(v0, v2);
2615 __ aesmc(v0, v0);
2616
2617 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2618 __ rev32(v1, __ T16B, v1);
2619 __ rev32(v2, __ T16B, v2);
2620
2621 __ cmpw(keylen, 52);
2622 __ br(Assembler::EQ, L_doLast);
2623
2624 __ aese(v0, v1);
2625 __ aesmc(v0, v0);
2626 __ aese(v0, v2);
2627 __ aesmc(v0, v0);
2628
2629 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2630 __ rev32(v1, __ T16B, v1);
2631 __ rev32(v2, __ T16B, v2);
2632
2633 __ BIND(L_doLast);
2634
2635 __ aese(v0, v1);
2636 __ aesmc(v0, v0);
2637 __ aese(v0, v2);
2638
2639 __ ld1(v1, __ T16B, key);
2640 __ rev32(v1, __ T16B, v1);
2641 __ eor(v0, __ T16B, v0, v1);
2642
2643 __ st1(v0, __ T16B, to);
2644
2645 __ mov(r0, 0);
2646
2647 __ leave();
2648 __ ret(lr);
2649
2650 return start;
2651 }
2652
2653 // Arguments:
2654 //
2655 // Inputs:
2656 // c_rarg0 - source byte array address
2657 // c_rarg1 - destination byte array address
2658 // c_rarg2 - K (key) in little endian int array
2659 //
2660   address generate_aescrypt_decryptBlock() {
2661 assert(UseAES, "need AES cryptographic extension support");
2662 __ align(CodeEntryAlignment);
2663 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2664 Label L_doLast;
2665
2666 const Register from = c_rarg0; // source array address
2667 const Register to = c_rarg1; // destination array address
2668 const Register key = c_rarg2; // key array address
2669 const Register keylen = rscratch1;
2670
2671 address start = __ pc();
2672 __ enter(); // required for proper stackwalking of RuntimeStub frame
2673
2674 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2675
2676 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2677
2678 __ ld1(v5, __ T16B, __ post(key, 16));
2679 __ rev32(v5, __ T16B, v5);
2680
2681 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2682 __ rev32(v1, __ T16B, v1);
2683 __ rev32(v2, __ T16B, v2);
2684 __ rev32(v3, __ T16B, v3);
2685 __ rev32(v4, __ T16B, v4);
2686 __ aesd(v0, v1);
2687 __ aesimc(v0, v0);
2688 __ aesd(v0, v2);
2689 __ aesimc(v0, v0);
2690 __ aesd(v0, v3);
2691 __ aesimc(v0, v0);
2692 __ aesd(v0, v4);
2693 __ aesimc(v0, v0);
2694
2695 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2696 __ rev32(v1, __ T16B, v1);
2697 __ rev32(v2, __ T16B, v2);
2698 __ rev32(v3, __ T16B, v3);
2699 __ rev32(v4, __ T16B, v4);
2700 __ aesd(v0, v1);
2701 __ aesimc(v0, v0);
2702 __ aesd(v0, v2);
2703 __ aesimc(v0, v0);
2704 __ aesd(v0, v3);
2705 __ aesimc(v0, v0);
2706 __ aesd(v0, v4);
2707 __ aesimc(v0, v0);
2708
2709 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2710 __ rev32(v1, __ T16B, v1);
2711 __ rev32(v2, __ T16B, v2);
2712
2713 __ cmpw(keylen, 44);
2714 __ br(Assembler::EQ, L_doLast);
2715
2716 __ aesd(v0, v1);
2717 __ aesimc(v0, v0);
2718 __ aesd(v0, v2);
2719 __ aesimc(v0, v0);
2720
2721 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2722 __ rev32(v1, __ T16B, v1);
2723 __ rev32(v2, __ T16B, v2);
2724
2725 __ cmpw(keylen, 52);
2726 __ br(Assembler::EQ, L_doLast);
2727
2728 __ aesd(v0, v1);
2729 __ aesimc(v0, v0);
2730 __ aesd(v0, v2);
2731 __ aesimc(v0, v0);
2732
2733 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2734 __ rev32(v1, __ T16B, v1);
2735 __ rev32(v2, __ T16B, v2);
2736
2737 __ BIND(L_doLast);
2738
2739 __ aesd(v0, v1);
2740 __ aesimc(v0, v0);
2741 __ aesd(v0, v2);
2742
2743 __ eor(v0, __ T16B, v0, v5);
2744
2745 __ st1(v0, __ T16B, to);
2746
2747 __ mov(r0, 0);
2748
2749 __ leave();
2750 __ ret(lr);
2751
2752 return start;
2753 }
2754
2755 // Arguments:
2756 //
2757 // Inputs:
2758 // c_rarg0 - source byte array address
2759 // c_rarg1 - destination byte array address
2760 // c_rarg2 - K (key) in little endian int array
2761 // c_rarg3 - r vector byte array address
2762 // c_rarg4 - input length
2763 //
2764 // Output:
2765 // x0 - input length
2766 //
2767   address generate_cipherBlockChaining_encryptAESCrypt() {
2768 assert(UseAES, "need AES cryptographic extension support");
2769 __ align(CodeEntryAlignment);
2770 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2771
2772 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2773
2774 const Register from = c_rarg0; // source array address
2775 const Register to = c_rarg1; // destination array address
2776 const Register key = c_rarg2; // key array address
2777 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2778 // and left with the results of the last encryption block
2779 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2780 const Register keylen = rscratch1;
2781
2782 address start = __ pc();
2783
2784 __ enter();
2785
2786 __ movw(rscratch2, len_reg);
2787
2788 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2789
2790 __ ld1(v0, __ T16B, rvec);
2791
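    // keylen is 44/52/60 words for AES-128/-192/-256. The condition flags
    // set here are still live at L_aes_loop: no intervening instruction
    // writes them, so each loop iteration can re-branch on the key size.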
2792 __ cmpw(keylen, 52);
2793 __ br(Assembler::CC, L_loadkeys_44);
2794 __ br(Assembler::EQ, L_loadkeys_52);
2795
2796 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2797 __ rev32(v17, __ T16B, v17);
2798 __ rev32(v18, __ T16B, v18);
2799 __ BIND(L_loadkeys_52);
2800 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2801 __ rev32(v19, __ T16B, v19);
2802 __ rev32(v20, __ T16B, v20);
2803 __ BIND(L_loadkeys_44);
2804 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2805 __ rev32(v21, __ T16B, v21);
2806 __ rev32(v22, __ T16B, v22);
2807 __ rev32(v23, __ T16B, v23);
2808 __ rev32(v24, __ T16B, v24);
2809 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2810 __ rev32(v25, __ T16B, v25);
2811 __ rev32(v26, __ T16B, v26);
2812 __ rev32(v27, __ T16B, v27);
2813 __ rev32(v28, __ T16B, v28);
2814 __ ld1(v29, v30, v31, __ T16B, key);
2815 __ rev32(v29, __ T16B, v29);
2816 __ rev32(v30, __ T16B, v30);
2817 __ rev32(v31, __ T16B, v31);
2818
2819 __ BIND(L_aes_loop);
2820 __ ld1(v1, __ T16B, __ post(from, 16));
2821 __ eor(v0, __ T16B, v0, v1);
2822
2823 __ br(Assembler::CC, L_rounds_44);
2824 __ br(Assembler::EQ, L_rounds_52);
2825
2826 __ aese(v0, v17); __ aesmc(v0, v0);
2827 __ aese(v0, v18); __ aesmc(v0, v0);
2828 __ BIND(L_rounds_52);
2829 __ aese(v0, v19); __ aesmc(v0, v0);
2830 __ aese(v0, v20); __ aesmc(v0, v0);
2831 __ BIND(L_rounds_44);
2832 __ aese(v0, v21); __ aesmc(v0, v0);
2833 __ aese(v0, v22); __ aesmc(v0, v0);
2834 __ aese(v0, v23); __ aesmc(v0, v0);
2835 __ aese(v0, v24); __ aesmc(v0, v0);
2836 __ aese(v0, v25); __ aesmc(v0, v0);
2837 __ aese(v0, v26); __ aesmc(v0, v0);
2838 __ aese(v0, v27); __ aesmc(v0, v0);
2839 __ aese(v0, v28); __ aesmc(v0, v0);
2840 __ aese(v0, v29); __ aesmc(v0, v0);
2841 __ aese(v0, v30);
2842 __ eor(v0, __ T16B, v0, v31);
2843
2844 __ st1(v0, __ T16B, __ post(to, 16));
2845
2846 __ subw(len_reg, len_reg, 16);
2847 __ cbnzw(len_reg, L_aes_loop);
2848
2849 __ st1(v0, __ T16B, rvec);
2850
2851 __ mov(r0, rscratch2);
2852
2853 __ leave();
2854 __ ret(lr);
2855
2856 return start;
2857 }
2858
2859 // Arguments:
2860 //
2861 // Inputs:
2862 // c_rarg0 - source byte array address
2863 // c_rarg1 - destination byte array address
2864 // c_rarg2 - K (key) in little endian int array
2865 // c_rarg3 - r vector byte array address
2866 // c_rarg4 - input length
2867 //
2868 // Output:
2869 // r0 - input length
2870 //
2871   address generate_cipherBlockChaining_decryptAESCrypt() {
2872 assert(UseAES, "need AES cryptographic extension support");
2873 __ align(CodeEntryAlignment);
2874 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2875
2876 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2877
2878 const Register from = c_rarg0; // source array address
2879 const Register to = c_rarg1; // destination array address
2880 const Register key = c_rarg2; // key array address
2881 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2882 // and left with the results of the last encryption block
2883 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2884 const Register keylen = rscratch1;
2885
2886 address start = __ pc();
2887
2888 __ enter();
2889
2890 __ movw(rscratch2, len_reg);
2891
2892 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2893
2894 __ ld1(v2, __ T16B, rvec);
2895
2896 __ ld1(v31, __ T16B, __ post(key, 16));
2897 __ rev32(v31, __ T16B, v31);
2898
2899 __ cmpw(keylen, 52);
2900 __ br(Assembler::CC, L_loadkeys_44);
2901 __ br(Assembler::EQ, L_loadkeys_52);
2902
2903 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2904 __ rev32(v17, __ T16B, v17);
2905 __ rev32(v18, __ T16B, v18);
2906 __ BIND(L_loadkeys_52);
2907 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2908 __ rev32(v19, __ T16B, v19);
2909 __ rev32(v20, __ T16B, v20);
2910 __ BIND(L_loadkeys_44);
2911 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2912 __ rev32(v21, __ T16B, v21);
2913 __ rev32(v22, __ T16B, v22);
2914 __ rev32(v23, __ T16B, v23);
2915 __ rev32(v24, __ T16B, v24);
2916 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2917 __ rev32(v25, __ T16B, v25);
2918 __ rev32(v26, __ T16B, v26);
2919 __ rev32(v27, __ T16B, v27);
2920 __ rev32(v28, __ T16B, v28);
2921 __ ld1(v29, v30, __ T16B, key);
2922 __ rev32(v29, __ T16B, v29);
2923 __ rev32(v30, __ T16B, v30);
2924
2925 __ BIND(L_aes_loop);
2926 __ ld1(v0, __ T16B, __ post(from, 16));
2927 __ orr(v1, __ T16B, v0, v0);
2928
2929 __ br(Assembler::CC, L_rounds_44);
2930 __ br(Assembler::EQ, L_rounds_52);
2931
2932 __ aesd(v0, v17); __ aesimc(v0, v0);
2933 __ aesd(v0, v18); __ aesimc(v0, v0);
2934 __ BIND(L_rounds_52);
2935 __ aesd(v0, v19); __ aesimc(v0, v0);
2936 __ aesd(v0, v20); __ aesimc(v0, v0);
2937 __ BIND(L_rounds_44);
2938 __ aesd(v0, v21); __ aesimc(v0, v0);
2939 __ aesd(v0, v22); __ aesimc(v0, v0);
2940 __ aesd(v0, v23); __ aesimc(v0, v0);
2941 __ aesd(v0, v24); __ aesimc(v0, v0);
2942 __ aesd(v0, v25); __ aesimc(v0, v0);
2943 __ aesd(v0, v26); __ aesimc(v0, v0);
2944 __ aesd(v0, v27); __ aesimc(v0, v0);
2945 __ aesd(v0, v28); __ aesimc(v0, v0);
2946 __ aesd(v0, v29); __ aesimc(v0, v0);
2947 __ aesd(v0, v30);
2948 __ eor(v0, __ T16B, v0, v31);
2949 __ eor(v0, __ T16B, v0, v2);
2950
2951 __ st1(v0, __ T16B, __ post(to, 16));
2952 __ orr(v2, __ T16B, v1, v1);
2953
2954 __ subw(len_reg, len_reg, 16);
2955 __ cbnzw(len_reg, L_aes_loop);
2956
2957 __ st1(v2, __ T16B, rvec);
2958
2959 __ mov(r0, rscratch2);
2960
2961 __ leave();
2962 __ ret(lr);
2963
2964 return start;
2965 }
2966
2967 // Arguments:
2968 //
2969 // Inputs:
2970 // c_rarg0 - byte[] source+offset
2971 // c_rarg1 - int[] SHA.state
2972 // c_rarg2 - int offset
2973 // c_rarg3 - int limit
2974 //
2975   address generate_sha1_implCompress(bool multi_block, const char *name) {
2976 __ align(CodeEntryAlignment);
2977 StubCodeMark mark(this, "StubRoutines", name);
2978 address start = __ pc();
2979
2980 Register buf = c_rarg0;
2981 Register state = c_rarg1;
2982 Register ofs = c_rarg2;
2983 Register limit = c_rarg3;
2984
2985 Label keys;
2986 Label sha1_loop;
2987
2988 // load the keys into v0..v3
2989 __ adr(rscratch1, keys);
2990 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2991 // load 5 words state into v6, v7
2992 __ ldrq(v6, Address(state, 0));
2993 __ ldrs(v7, Address(state, 16));
2994
2995
2996 __ BIND(sha1_loop);
2997 // load 64 bytes of data into v16..v19
2998 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2999 __ rev32(v16, __ T16B, v16);
3000 __ rev32(v17, __ T16B, v17);
3001 __ rev32(v18, __ T16B, v18);
3002 __ rev32(v19, __ T16B, v19);
3003
3004 // do the sha1
3005 __ addv(v4, __ T4S, v16, v0);
3006 __ orr(v20, __ T16B, v6, v6);
3007
3008 FloatRegister d0 = v16;
3009 FloatRegister d1 = v17;
3010 FloatRegister d2 = v18;
3011 FloatRegister d3 = v19;
3012
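    // Each iteration of this loop performs four of SHA-1's 80 rounds,
    // using sha1c/sha1p/sha1m for the choose/parity/majority round
    // functions; 'key' selects the constant for the current 20-round group.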
3013 for (int round = 0; round < 20; round++) {
3014 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3015 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3016 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3017 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3018 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3019
3020 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3021 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3022 __ sha1h(tmp2, __ T4S, v20);
3023 if (round < 5)
3024 __ sha1c(v20, __ T4S, tmp3, tmp4);
3025 else if (round < 10 || round >= 15)
3026 __ sha1p(v20, __ T4S, tmp3, tmp4);
3027 else
3028 __ sha1m(v20, __ T4S, tmp3, tmp4);
3029 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3030
3031 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3032 }
3033
3034 __ addv(v7, __ T2S, v7, v21);
3035 __ addv(v6, __ T4S, v6, v20);
3036
3037 if (multi_block) {
3038 __ add(ofs, ofs, 64);
3039 __ cmp(ofs, limit);
3040 __ br(Assembler::LE, sha1_loop);
3041 __ mov(c_rarg0, ofs); // return ofs
3042 }
3043
3044 __ strq(v6, Address(state, 0));
3045 __ strs(v7, Address(state, 16));
3046
3047 __ ret(lr);
3048
3049 __ bind(keys);
3050 __ emit_int32(0x5a827999);
3051 __ emit_int32(0x6ed9eba1);
3052 __ emit_int32(0x8f1bbcdc);
3053 __ emit_int32(0xca62c1d6);
3054
3055 return start;
3056 }
3057
3058
3059 // Arguments:
3060 //
3061 // Inputs:
3062 // c_rarg0 - byte[] source+offset
3063 // c_rarg1 - int[] SHA.state
3064 // c_rarg2 - int offset
3065 // c_rarg3 - int limit
3066 //
3067   address generate_sha256_implCompress(bool multi_block, const char *name) {
3068 static const uint32_t round_consts[64] = {
3069 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3070 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3071 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3072 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3073 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3074 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3075 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3076 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3077 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3078 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3079 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3080 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3081 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3082 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3083 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3084 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3085 };
3086 __ align(CodeEntryAlignment);
3087 StubCodeMark mark(this, "StubRoutines", name);
3088 address start = __ pc();
3089
3090 Register buf = c_rarg0;
3091 Register state = c_rarg1;
3092 Register ofs = c_rarg2;
3093 Register limit = c_rarg3;
3094
3095 Label sha1_loop;
3096
3097 __ stpd(v8, v9, __ pre(sp, -32));
3098 __ stpd(v10, v11, Address(sp, 16));
3099
3100 // dga == v0
3101 // dgb == v1
3102 // dg0 == v2
3103 // dg1 == v3
3104 // dg2 == v4
3105 // t0 == v6
3106 // t1 == v7
3107
3108 // load 16 keys to v16..v31
3109 __ lea(rscratch1, ExternalAddress((address)round_consts));
3110 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3111 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3112 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3113 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3114
3115 // load 8 words (256 bits) state
3116 __ ldpq(v0, v1, state);
3117
3118 __ BIND(sha1_loop);
3119 // load 64 bytes of data into v8..v11
3120 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3121 __ rev32(v8, __ T16B, v8);
3122 __ rev32(v9, __ T16B, v9);
3123 __ rev32(v10, __ T16B, v10);
3124 __ rev32(v11, __ T16B, v11);
3125
3126 __ addv(v6, __ T4S, v8, v16);
3127 __ orr(v2, __ T16B, v0, v0);
3128 __ orr(v3, __ T16B, v1, v1);
3129
3130 FloatRegister d0 = v8;
3131 FloatRegister d1 = v9;
3132 FloatRegister d2 = v10;
3133 FloatRegister d3 = v11;
3134
3135
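    // Each iteration performs four of SHA-256's 64 rounds: sha256h/sha256h2
    // update the two state halves while sha256su0/sha256su1 extend the
    // message schedule in place.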
3136 for (int round = 0; round < 16; round++) {
3137 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3138 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3139 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3140 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3141
3142 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3143 __ orr(v4, __ T16B, v2, v2);
3144 if (round < 15)
3145 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3146 __ sha256h(v2, __ T4S, v3, tmp2);
3147 __ sha256h2(v3, __ T4S, v4, tmp2);
3148 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3149
3150 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3151 }
3152
3153 __ addv(v0, __ T4S, v0, v2);
3154 __ addv(v1, __ T4S, v1, v3);
3155
3156 if (multi_block) {
3157 __ add(ofs, ofs, 64);
3158 __ cmp(ofs, limit);
3159 __ br(Assembler::LE, sha1_loop);
3160 __ mov(c_rarg0, ofs); // return ofs
3161 }
3162
3163 __ ldpd(v10, v11, Address(sp, 16));
3164 __ ldpd(v8, v9, __ post(sp, 32));
3165
3166 __ stpq(v0, v1, state);
3167
3168 __ ret(lr);
3169
3170 return start;
3171 }
3172
3173 // Arguments:
3174 //
3175 // Inputs:
3176 // c_rarg0 - byte[] source+offset
3177 // c_rarg1 - int[] SHA.state
3178 // c_rarg2 - int offset
3179 // c_rarg3 - int limit
3180 //
3181   address generate_sha512_implCompress(bool multi_block, const char *name) {
3182 static const uint64_t round_consts[80] = {
3183 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3184 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3185 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3186 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3187 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3188 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3189 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3190 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3191 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3192 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3193 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3194 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3195 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3196 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3197 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3198 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3199 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3200 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3201 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3202 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3203 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3204 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3205 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3206 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3207 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3208 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3209 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3210 };
3211
3212 // Double rounds for sha512.
3213 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3214 if (dr < 36) \
3215 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \
3216 __ addv(v5, __ T2D, v##rc0, v##in0); \
3217 __ ext(v6, __ T16B, v##i2, v##i3, 8); \
3218 __ ext(v5, __ T16B, v5, v5, 8); \
3219 __ ext(v7, __ T16B, v##i1, v##i2, 8); \
3220 __ addv(v##i3, __ T2D, v##i3, v5); \
3221 if (dr < 32) { \
3222 __ ext(v5, __ T16B, v##in3, v##in4, 8); \
3223 __ sha512su0(v##in0, __ T2D, v##in1); \
3224 } \
3225 __ sha512h(v##i3, __ T2D, v6, v7); \
3226 if (dr < 32) \
3227 __ sha512su1(v##in0, __ T2D, v##in2, v5); \
3228 __ addv(v##i4, __ T2D, v##i1, v##i3); \
3229 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \
3230
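  // For reference, a scalar sketch (FIPS 180-4 notation; not generated code)
  // of what one dround performs: sha512su0/sha512su1 extend the message
  // schedule and sha512h/sha512h2 apply two compression rounds.
  //
  //   static inline uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }
  //   // message schedule: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
  //   //   sigma0(x) = ror64(x, 1)  ^ ror64(x, 8)  ^ (x >> 7)
  //   //   sigma1(x) = ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6)
  //   // one compression round, applied twice per dround:
  //   //   T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t]
  //   //   T2 = Sigma0(a) + Maj(a, b, c)
  //   //   h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2;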
3231 __ align(CodeEntryAlignment);
3232 StubCodeMark mark(this, "StubRoutines", name);
3233 address start = __ pc();
3234
3235 Register buf = c_rarg0;
3236 Register state = c_rarg1;
3237 Register ofs = c_rarg2;
3238 Register limit = c_rarg3;
3239
3240 __ stpd(v8, v9, __ pre(sp, -64));
3241 __ stpd(v10, v11, Address(sp, 16));
3242 __ stpd(v12, v13, Address(sp, 32));
3243 __ stpd(v14, v15, Address(sp, 48));
3244
3245 Label sha512_loop;
3246
3247 // load state
3248 __ ld1(v8, v9, v10, v11, __ T2D, state);
3249
3250 // load first 4 round constants
3251 __ lea(rscratch1, ExternalAddress((address)round_consts));
3252 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3253
3254 __ BIND(sha512_loop);
3255 // load 128B of data into v12..v19
3256 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3257 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3258 __ rev64(v12, __ T16B, v12);
3259 __ rev64(v13, __ T16B, v13);
3260 __ rev64(v14, __ T16B, v14);
3261 __ rev64(v15, __ T16B, v15);
3262 __ rev64(v16, __ T16B, v16);
3263 __ rev64(v17, __ T16B, v17);
3264 __ rev64(v18, __ T16B, v18);
3265 __ rev64(v19, __ T16B, v19);
3266
3267 __ mov(rscratch2, rscratch1);
3268
3269 __ mov(v0, __ T16B, v8);
3270 __ mov(v1, __ T16B, v9);
3271 __ mov(v2, __ T16B, v10);
3272 __ mov(v3, __ T16B, v11);
3273
3274 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3275 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3276 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3277 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3278 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3279 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3280 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3281 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3282 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3283 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3284 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3285 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3286 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3287 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3288 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3289 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3290 sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3291 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3292 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3293 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3294 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3295 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3296 sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3297 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3298 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3299 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3300 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3301 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3302 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3303 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3304 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3305 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3306 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0);
3307 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0);
3308 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0);
3309 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0);
3310 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0);
3311 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0);
3312 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0);
3313 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0);
3314
3315 __ addv(v8, __ T2D, v8, v0);
3316 __ addv(v9, __ T2D, v9, v1);
3317 __ addv(v10, __ T2D, v10, v2);
3318 __ addv(v11, __ T2D, v11, v3);
3319
3320 if (multi_block) {
3321 __ add(ofs, ofs, 128);
3322 __ cmp(ofs, limit);
3323 __ br(Assembler::LE, sha512_loop);
3324 __ mov(c_rarg0, ofs); // return ofs
3325 }
3326
3327 __ st1(v8, v9, v10, v11, __ T2D, state);
3328
3329 __ ldpd(v14, v15, Address(sp, 48));
3330 __ ldpd(v12, v13, Address(sp, 32));
3331 __ ldpd(v10, v11, Address(sp, 16));
3332 __ ldpd(v8, v9, __ post(sp, 64));
3333
3334 __ ret(lr);
3335
3336 return start;
3337 }
3338
3339 // Arguments:
3340 //
3341 // Inputs:
3342 // c_rarg0 - byte[] source+offset
3343 // c_rarg1 - byte[] SHA.state
3344 // c_rarg2 - int digest_length
3345 // c_rarg3 - int offset
3346 // c_rarg4 - int limit
3347 //
3348   address generate_sha3_implCompress(bool multi_block, const char *name) {
3349 static const uint64_t round_consts[24] = {
3350 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3351 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3352 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3353 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3354 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3355 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3356 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3357 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3358 };
3359
3360 __ align(CodeEntryAlignment);
3361 StubCodeMark mark(this, "StubRoutines", name);
3362 address start = __ pc();
3363
3364 Register buf = c_rarg0;
3365 Register state = c_rarg1;
3366 Register digest_length = c_rarg2;
3367 Register ofs = c_rarg3;
3368 Register limit = c_rarg4;
3369
3370 Label sha3_loop, rounds24_loop;
3371 Label sha3_512, sha3_384_or_224, sha3_256;
3372
3373 __ stpd(v8, v9, __ pre(sp, -64));
3374 __ stpd(v10, v11, Address(sp, 16));
3375 __ stpd(v12, v13, Address(sp, 32));
3376 __ stpd(v14, v15, Address(sp, 48));
3377
3378 // load state
3379 __ add(rscratch1, state, 32);
3380 __ ld1(v0, v1, v2, v3, __ T1D, state);
3381 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
3382 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3383 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3384 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3385 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3386 __ ld1(v24, __ T1D, rscratch1);
3387
3388 __ BIND(sha3_loop);
3389
3390 // 24 keccak rounds
3391 __ movw(rscratch2, 24);
3392
3393 // load round_constants base
3394 __ lea(rscratch1, ExternalAddress((address) round_consts));
3395
3396 // load input
3397 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3398 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3399 __ eor(v0, __ T8B, v0, v25);
3400 __ eor(v1, __ T8B, v1, v26);
3401 __ eor(v2, __ T8B, v2, v27);
3402 __ eor(v3, __ T8B, v3, v28);
3403 __ eor(v4, __ T8B, v4, v29);
3404 __ eor(v5, __ T8B, v5, v30);
3405 __ eor(v6, __ T8B, v6, v31);
3406
3407 // digest_length == 64, SHA3-512
3408 __ tbnz(digest_length, 6, sha3_512);
3409
3410 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3411 __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3412 __ eor(v7, __ T8B, v7, v25);
3413 __ eor(v8, __ T8B, v8, v26);
3414 __ eor(v9, __ T8B, v9, v27);
3415 __ eor(v10, __ T8B, v10, v28);
3416 __ eor(v11, __ T8B, v11, v29);
3417 __ eor(v12, __ T8B, v12, v30);
3418
3419 // digest_length == 28, SHA3-224; digest_length == 48, SHA3-384
3420 __ tbnz(digest_length, 4, sha3_384_or_224);
3421
3422 // SHA3-256
3423 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3424 __ eor(v13, __ T8B, v13, v25);
3425 __ eor(v14, __ T8B, v14, v26);
3426 __ eor(v15, __ T8B, v15, v27);
3427 __ eor(v16, __ T8B, v16, v28);
3428 __ b(rounds24_loop);
3429
3430 __ BIND(sha3_384_or_224);
3431     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3432
3433 // SHA3-224
3434 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3435 __ ld1(v29, __ T8B, __ post(buf, 8));
3436 __ eor(v13, __ T8B, v13, v25);
3437 __ eor(v14, __ T8B, v14, v26);
3438 __ eor(v15, __ T8B, v15, v27);
3439 __ eor(v16, __ T8B, v16, v28);
3440 __ eor(v17, __ T8B, v17, v29);
3441 __ b(rounds24_loop);
3442
3443 __ BIND(sha3_512);
3444 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3445 __ eor(v7, __ T8B, v7, v25);
3446 __ eor(v8, __ T8B, v8, v26);
3447
3448 __ BIND(rounds24_loop);
3449 __ subw(rscratch2, rscratch2, 1);
3450
3451 __ eor3(v29, __ T16B, v4, v9, v14);
3452 __ eor3(v26, __ T16B, v1, v6, v11);
3453 __ eor3(v28, __ T16B, v3, v8, v13);
3454 __ eor3(v25, __ T16B, v0, v5, v10);
3455 __ eor3(v27, __ T16B, v2, v7, v12);
3456 __ eor3(v29, __ T16B, v29, v19, v24);
3457 __ eor3(v26, __ T16B, v26, v16, v21);
3458 __ eor3(v28, __ T16B, v28, v18, v23);
3459 __ eor3(v25, __ T16B, v25, v15, v20);
3460 __ eor3(v27, __ T16B, v27, v17, v22);
3461
3462 __ rax1(v30, __ T2D, v29, v26);
3463 __ rax1(v26, __ T2D, v26, v28);
3464 __ rax1(v28, __ T2D, v28, v25);
3465 __ rax1(v25, __ T2D, v25, v27);
3466 __ rax1(v27, __ T2D, v27, v29);
3467
3468 __ eor(v0, __ T16B, v0, v30);
3469 __ xar(v29, __ T2D, v1, v25, (64 - 1));
3470 __ xar(v1, __ T2D, v6, v25, (64 - 44));
3471 __ xar(v6, __ T2D, v9, v28, (64 - 20));
3472 __ xar(v9, __ T2D, v22, v26, (64 - 61));
3473 __ xar(v22, __ T2D, v14, v28, (64 - 39));
3474 __ xar(v14, __ T2D, v20, v30, (64 - 18));
3475 __ xar(v31, __ T2D, v2, v26, (64 - 62));
3476 __ xar(v2, __ T2D, v12, v26, (64 - 43));
3477 __ xar(v12, __ T2D, v13, v27, (64 - 25));
3478 __ xar(v13, __ T2D, v19, v28, (64 - 8));
3479 __ xar(v19, __ T2D, v23, v27, (64 - 56));
3480 __ xar(v23, __ T2D, v15, v30, (64 - 41));
3481 __ xar(v15, __ T2D, v4, v28, (64 - 27));
3482 __ xar(v28, __ T2D, v24, v28, (64 - 14));
3483 __ xar(v24, __ T2D, v21, v25, (64 - 2));
3484 __ xar(v8, __ T2D, v8, v27, (64 - 55));
3485 __ xar(v4, __ T2D, v16, v25, (64 - 45));
3486 __ xar(v16, __ T2D, v5, v30, (64 - 36));
3487 __ xar(v5, __ T2D, v3, v27, (64 - 28));
3488 __ xar(v27, __ T2D, v18, v27, (64 - 21));
3489 __ xar(v3, __ T2D, v17, v26, (64 - 15));
3490 __ xar(v25, __ T2D, v11, v25, (64 - 10));
3491 __ xar(v26, __ T2D, v7, v26, (64 - 6));
3492 __ xar(v30, __ T2D, v10, v30, (64 - 3));
3493
3494 __ bcax(v20, __ T16B, v31, v22, v8);
3495 __ bcax(v21, __ T16B, v8, v23, v22);
3496 __ bcax(v22, __ T16B, v22, v24, v23);
3497 __ bcax(v23, __ T16B, v23, v31, v24);
3498 __ bcax(v24, __ T16B, v24, v8, v31);
3499
3500 __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3501
3502 __ bcax(v17, __ T16B, v25, v19, v3);
3503 __ bcax(v18, __ T16B, v3, v15, v19);
3504 __ bcax(v19, __ T16B, v19, v16, v15);
3505 __ bcax(v15, __ T16B, v15, v25, v16);
3506 __ bcax(v16, __ T16B, v16, v3, v25);
3507
3508 __ bcax(v10, __ T16B, v29, v12, v26);
3509 __ bcax(v11, __ T16B, v26, v13, v12);
3510 __ bcax(v12, __ T16B, v12, v14, v13);
3511 __ bcax(v13, __ T16B, v13, v29, v14);
3512 __ bcax(v14, __ T16B, v14, v26, v29);
3513
3514 __ bcax(v7, __ T16B, v30, v9, v4);
3515 __ bcax(v8, __ T16B, v4, v5, v9);
3516 __ bcax(v9, __ T16B, v9, v6, v5);
3517 __ bcax(v5, __ T16B, v5, v30, v6);
3518 __ bcax(v6, __ T16B, v6, v4, v30);
3519
3520 __ bcax(v3, __ T16B, v27, v0, v28);
3521 __ bcax(v4, __ T16B, v28, v1, v0);
3522 __ bcax(v0, __ T16B, v0, v2, v1);
3523 __ bcax(v1, __ T16B, v1, v27, v2);
3524 __ bcax(v2, __ T16B, v2, v28, v27);
3525
3526 __ eor(v0, __ T16B, v0, v31);
3527
3528 __ cbnzw(rscratch2, rounds24_loop);
3529
3530 if (multi_block) {
3531 // block_size = 200 - 2 * digest_length, ofs += block_size
3532 __ add(ofs, ofs, 200);
3533 __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3534
3535 __ cmp(ofs, limit);
3536 __ br(Assembler::LE, sha3_loop);
3537 __ mov(c_rarg0, ofs); // return ofs
3538 }
3539
3540 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
3541 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
3542 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3543 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3544 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3545 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3546 __ st1(v24, __ T1D, state);
3547
3548 __ ldpd(v14, v15, Address(sp, 48));
3549 __ ldpd(v12, v13, Address(sp, 32));
3550 __ ldpd(v10, v11, Address(sp, 16));
3551 __ ldpd(v8, v9, __ post(sp, 64));
3552
3553 __ ret(lr);
3554
3555 return start;
3556 }
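  // For reference, the variant dispatch above keys off bits of digest_length
  // (block_size, the sponge rate, is 200 - 2 * digest_length bytes):
  //
  //   digest_length  binary    bit 6  bit 4  bit 2  rate (bytes)
  //   64  SHA3-512   100'0000    1      -      -       72
  //   48  SHA3-384   011'0000    0      1      0      104
  //   32  SHA3-256   010'0000    0      0      -      136
  //   28  SHA3-224   001'1100    0      1      1      144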
3557
3558 // Safefetch stubs.
3559   void generate_safefetch(const char* name, int size, address* entry,
3560 address* fault_pc, address* continuation_pc) {
3561 // safefetch signatures:
3562 // int SafeFetch32(int* adr, int errValue);
3563 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3564 //
3565 // arguments:
3566 // c_rarg0 = adr
3567 // c_rarg1 = errValue
3568 //
3569 // result:
3570     //   r0 = *adr or errValue
3571
3572 StubCodeMark mark(this, "StubRoutines", name);
3573
3574 // Entry point, pc or function descriptor.
3575 *entry = __ pc();
3576
3577 // Load *adr into c_rarg1, may fault.
3578 *fault_pc = __ pc();
3579 switch (size) {
3580 case 4:
3581 // int32_t
3582 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3583 break;
3584 case 8:
3585 // int64_t
3586 __ ldr(c_rarg1, Address(c_rarg0, 0));
3587 break;
3588 default:
3589 ShouldNotReachHere();
3590 }
3591
3592 // return errValue or *adr
3593 *continuation_pc = __ pc();
3594 __ mov(r0, c_rarg1);
3595 __ ret(lr);
3596 }
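  // Typical use in the VM (illustrative only): probe memory that may be
  // unmapped, e.g. during error reporting, without risking a secondary crash.
  //
  //   int v = SafeFetch32((int*) suspect, 0xBAD);
  //   // v == 0xBAD if the load faulted (or if the word really held 0xBAD)
  //
  // A fault at *fault_pc is fielded by the signal handler, which resumes
  // execution at *continuation_pc with c_rarg1 still holding errValue.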
3597
3598 /**
3599 * Arguments:
3600 *
3601 * Inputs:
3602 * c_rarg0 - int crc
3603 * c_rarg1 - byte* buf
3604 * c_rarg2 - int length
3605 *
3606    * Output:
3607    *       r0   - int crc result
3608 */
3609   address generate_updateBytesCRC32() {
3610 assert(UseCRC32Intrinsics, "what are we doing here?");
3611
3612 __ align(CodeEntryAlignment);
3613 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3614
3615 address start = __ pc();
3616
3617 const Register crc = c_rarg0; // crc
3618 const Register buf = c_rarg1; // source java byte array address
3619 const Register len = c_rarg2; // length
3620 const Register table0 = c_rarg3; // crc_table address
3621 const Register table1 = c_rarg4;
3622 const Register table2 = c_rarg5;
3623 const Register table3 = c_rarg6;
3624 const Register tmp3 = c_rarg7;
3625
3626 BLOCK_COMMENT("Entry:");
3627 __ enter(); // required for proper stackwalking of RuntimeStub frame
3628
3629 __ kernel_crc32(crc, buf, len,
3630 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3631
3632 __ leave(); // required for proper stackwalking of RuntimeStub frame
3633 __ ret(lr);
3634
3635 return start;
3636 }
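  // kernel_crc32 implements the standard zlib/java.util.zip CRC-32
  // (reflected polynomial 0xEDB88320). A bitwise reference sketch of the
  // same function -- the stub itself uses table-driven and fused forms:
  //
  //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, int len) {
  //     crc = ~crc;
  //     for (int i = 0; i < len; i++) {
  //       crc ^= buf[i];
  //       for (int b = 0; b < 8; b++)
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
  //     }
  //     return ~crc;
  //   }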
3637
3638 /**
3639 * Arguments:
3640 *
3641 * Inputs:
3642 * c_rarg0 - int crc
3643 * c_rarg1 - byte* buf
3644 * c_rarg2 - int length
3645 * c_rarg3 - int* table
3646 *
3647    * Output:
3648 * r0 - int crc result
3649 */
3650   address generate_updateBytesCRC32C() {
3651 assert(UseCRC32CIntrinsics, "what are we doing here?");
3652
3653 __ align(CodeEntryAlignment);
3654 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3655
3656 address start = __ pc();
3657
3658 const Register crc = c_rarg0; // crc
3659 const Register buf = c_rarg1; // source java byte array address
3660 const Register len = c_rarg2; // length
3661 const Register table0 = c_rarg3; // crc_table address
3662 const Register table1 = c_rarg4;
3663 const Register table2 = c_rarg5;
3664 const Register table3 = c_rarg6;
3665 const Register tmp3 = c_rarg7;
3666
3667 BLOCK_COMMENT("Entry:");
3668 __ enter(); // required for proper stackwalking of RuntimeStub frame
3669
3670 __ kernel_crc32c(crc, buf, len,
3671 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3672
3673 __ leave(); // required for proper stackwalking of RuntimeStub frame
3674 __ ret(lr);
3675
3676 return start;
3677 }
3678
3679   /**
3680 * Arguments:
3681 *
3682 * Inputs:
3683 * c_rarg0 - int adler
3684 * c_rarg1 - byte* buff
3685 * c_rarg2 - int len
3686 *
3687 * Output:
3688 * c_rarg0 - int adler result
3689 */
3690   address generate_updateBytesAdler32() {
3691 __ align(CodeEntryAlignment);
3692 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3693 address start = __ pc();
3694
3695 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3696
3697 // Aliases
3698 Register adler = c_rarg0;
3699 Register s1 = c_rarg0;
3700 Register s2 = c_rarg3;
3701 Register buff = c_rarg1;
3702 Register len = c_rarg2;
3703 Register nmax = r4;
3704 Register base = r5;
3705 Register count = r6;
3706 Register temp0 = rscratch1;
3707 Register temp1 = rscratch2;
3708 FloatRegister vbytes = v0;
3709 FloatRegister vs1acc = v1;
3710 FloatRegister vs2acc = v2;
3711 FloatRegister vtable = v3;
3712
3713 // Max number of bytes we can process before having to take the mod
3714 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3715 uint64_t BASE = 0xfff1;
3716 uint64_t NMAX = 0x15B0;
3717
3718 __ mov(base, BASE);
3719 __ mov(nmax, NMAX);
3720
3721 // Load accumulation coefficients for the upper 16 bits
3722 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3723 __ ld1(vtable, __ T16B, Address(temp0));
3724
3725 // s1 is initialized to the lower 16 bits of adler
3726 // s2 is initialized to the upper 16 bits of adler
3727 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3728 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3729
3730     // The pipelined loop needs at least 16 elements for one iteration.
3731     // It checks this itself, but it is cheaper to branch straight to the cleanup loop here.
3732 __ cmp(len, (u1)16);
3733 __ br(Assembler::HS, L_nmax);
3734 __ cbz(len, L_combine);
3735
3736 __ bind(L_simple_by1_loop);
3737 __ ldrb(temp0, Address(__ post(buff, 1)));
3738 __ add(s1, s1, temp0);
3739 __ add(s2, s2, s1);
3740 __ subs(len, len, 1);
3741 __ br(Assembler::HI, L_simple_by1_loop);
3742
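    // The folding reductions in this stub rely on 2^16 mod 65521 == 15, so
    // for any x:  x mod BASE == ((x & 0xffff) + 15 * (x >> 16)) mod BASE,
    // where 15 * t is computed as (t << 4) - t. One or two folds bring the
    // accumulator below 2 * BASE, after which a single conditional subtract
    // (subs/csel) finishes the reduction.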
3743 // s1 = s1 % BASE
3744 __ subs(temp0, s1, base);
3745 __ csel(s1, temp0, s1, Assembler::HS);
3746
3747 // s2 = s2 % BASE
3748 __ lsr(temp0, s2, 16);
3749 __ lsl(temp1, temp0, 4);
3750 __ sub(temp1, temp1, temp0);
3751 __ add(s2, temp1, s2, ext::uxth);
3752
3753 __ subs(temp0, s2, base);
3754 __ csel(s2, temp0, s2, Assembler::HS);
3755
3756 __ b(L_combine);
3757
3758 __ bind(L_nmax);
3759 __ subs(len, len, nmax);
3760 __ sub(count, nmax, 16);
3761 __ br(Assembler::LO, L_by16);
3762
3763 __ bind(L_nmax_loop);
3764
3765 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3766 vbytes, vs1acc, vs2acc, vtable);
3767
3768 __ subs(count, count, 16);
3769 __ br(Assembler::HS, L_nmax_loop);
3770
3771 // s1 = s1 % BASE
3772 __ lsr(temp0, s1, 16);
3773 __ lsl(temp1, temp0, 4);
3774 __ sub(temp1, temp1, temp0);
3775 __ add(temp1, temp1, s1, ext::uxth);
3776
3777 __ lsr(temp0, temp1, 16);
3778 __ lsl(s1, temp0, 4);
3779 __ sub(s1, s1, temp0);
3780 __ add(s1, s1, temp1, ext:: uxth);
3781
3782 __ subs(temp0, s1, base);
3783 __ csel(s1, temp0, s1, Assembler::HS);
3784
3785 // s2 = s2 % BASE
3786 __ lsr(temp0, s2, 16);
3787 __ lsl(temp1, temp0, 4);
3788 __ sub(temp1, temp1, temp0);
3789 __ add(temp1, temp1, s2, ext::uxth);
3790
3791 __ lsr(temp0, temp1, 16);
3792 __ lsl(s2, temp0, 4);
3793 __ sub(s2, s2, temp0);
3794 __ add(s2, s2, temp1, ext:: uxth);
3795
3796 __ subs(temp0, s2, base);
3797 __ csel(s2, temp0, s2, Assembler::HS);
3798
3799 __ subs(len, len, nmax);
3800 __ sub(count, nmax, 16);
3801 __ br(Assembler::HS, L_nmax_loop);
3802
3803 __ bind(L_by16);
3804 __ adds(len, len, count);
3805 __ br(Assembler::LO, L_by1);
3806
3807 __ bind(L_by16_loop);
3808
3809 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3810 vbytes, vs1acc, vs2acc, vtable);
3811
3812 __ subs(len, len, 16);
3813 __ br(Assembler::HS, L_by16_loop);
3814
3815 __ bind(L_by1);
3816 __ adds(len, len, 15);
3817 __ br(Assembler::LO, L_do_mod);
3818
3819 __ bind(L_by1_loop);
3820 __ ldrb(temp0, Address(__ post(buff, 1)));
3821 __ add(s1, temp0, s1);
3822 __ add(s2, s2, s1);
3823 __ subs(len, len, 1);
3824 __ br(Assembler::HS, L_by1_loop);
3825
3826 __ bind(L_do_mod);
3827 // s1 = s1 % BASE
3828 __ lsr(temp0, s1, 16);
3829 __ lsl(temp1, temp0, 4);
3830 __ sub(temp1, temp1, temp0);
3831 __ add(temp1, temp1, s1, ext::uxth);
3832
3833 __ lsr(temp0, temp1, 16);
3834 __ lsl(s1, temp0, 4);
3835 __ sub(s1, s1, temp0);
3836 __ add(s1, s1, temp1, ext:: uxth);
3837
3838 __ subs(temp0, s1, base);
3839 __ csel(s1, temp0, s1, Assembler::HS);
3840
3841 // s2 = s2 % BASE
3842 __ lsr(temp0, s2, 16);
3843 __ lsl(temp1, temp0, 4);
3844 __ sub(temp1, temp1, temp0);
3845 __ add(temp1, temp1, s2, ext::uxth);
3846
3847 __ lsr(temp0, temp1, 16);
3848 __ lsl(s2, temp0, 4);
3849 __ sub(s2, s2, temp0);
3850 __ add(s2, s2, temp1, ext:: uxth);
3851
3852 __ subs(temp0, s2, base);
3853 __ csel(s2, temp0, s2, Assembler::HS);
3854
3855 // Combine lower bits and higher bits
3856 __ bind(L_combine);
3857 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3858
3859 __ ret(lr);
3860
3861 return start;
3862 }
3863
3864   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3865 Register temp0, Register temp1, FloatRegister vbytes,
3866 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3867 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3868 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3869 // In non-vectorized code, we update s1 and s2 as:
3870 // s1 <- s1 + b1
3871 // s2 <- s2 + s1
3872 // s1 <- s1 + b2
3873     //     s2 <- s2 + s1
3874 // ...
3875 // s1 <- s1 + b16
3876 // s2 <- s2 + s1
3877     // Putting the above assignments together, we have:
3878 // s1_new = s1 + b1 + b2 + ... + b16
3879 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3880 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3881 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3882 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3883
3884 // s2 = s2 + s1 * 16
3885 __ add(s2, s2, s1, Assembler::LSL, 4);
3886
3887 // vs1acc = b1 + b2 + b3 + ... + b16
3888 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3889 __ umullv(vs2acc, __ T8B, vtable, vbytes);
3890 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3891 __ uaddlv(vs1acc, __ T16B, vbytes);
3892 __ uaddlv(vs2acc, __ T8H, vs2acc);
3893
3894 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3895 __ fmovd(temp0, vs1acc);
3896 __ fmovd(temp1, vs2acc);
3897 __ add(s1, s1, temp0);
3898 __ add(s2, s2, temp1);
3899 }
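  // Scalar equivalent of the vector arithmetic above (reference only; b[i]
  // denotes the 16 bytes loaded into vbytes):
  //
  //   uint32_t sum = 0, dot = 0;
  //   for (int i = 0; i < 16; i++) {
  //     sum += b[i];                 // uaddlv of vbytes
  //     dot += (16 - i) * b[i];      // umullv/umlalv against vtable, then uaddlv
  //   }
  //   s2 += s1 << 4;                 // done first, against the old s1
  //   s1 += sum;
  //   s2 += dot;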
3900
3901 /**
3902 * Arguments:
3903 *
3904 * Input:
3905 * c_rarg0 - x address
3906 * c_rarg1 - x length
3907 * c_rarg2 - y address
3908    * c_rarg3    - y length
3909 * c_rarg4 - z address
3910 * c_rarg5 - z length
3911 */
3912   address generate_multiplyToLen() {
3913 __ align(CodeEntryAlignment);
3914 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3915
3916 address start = __ pc();
3917 const Register x = r0;
3918 const Register xlen = r1;
3919 const Register y = r2;
3920 const Register ylen = r3;
3921 const Register z = r4;
3922 const Register zlen = r5;
3923
3924 const Register tmp1 = r10;
3925 const Register tmp2 = r11;
3926 const Register tmp3 = r12;
3927 const Register tmp4 = r13;
3928 const Register tmp5 = r14;
3929 const Register tmp6 = r15;
3930 const Register tmp7 = r16;
3931
3932 BLOCK_COMMENT("Entry:");
3933 __ enter(); // required for proper stackwalking of RuntimeStub frame
3934 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3935 __ leave(); // required for proper stackwalking of RuntimeStub frame
3936 __ ret(lr);
3937
3938 return start;
3939 }
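  // Reference semantics (a hedged sketch of BigInteger.multiplyToLen; ints
  // are big-endian words, z has xlen + ylen slots, and this simplified form
  // assumes z is pre-zeroed -- the Java code instead special-cases the
  // first pass):
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       uint64_t p = (uint64_t)(uint32_t)x[i] * (uint32_t)y[j]
  //                  + (uint32_t)z[k] + carry;
  //       z[k] = (int)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (int)carry;
  //   }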
3940
3941   address generate_squareToLen() {
3942     // The squareToLen algorithm for sizes 1..127, implemented in Java code,
3943     // runs faster than multiply_to_len on some CPUs and slower on others, but
3944     // multiply_to_len shows slightly better results overall.
3945 __ align(CodeEntryAlignment);
3946 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3947 address start = __ pc();
3948
3949 const Register x = r0;
3950 const Register xlen = r1;
3951 const Register z = r2;
3952 const Register zlen = r3;
3953 const Register y = r4; // == x
3954 const Register ylen = r5; // == xlen
3955
3956 const Register tmp1 = r10;
3957 const Register tmp2 = r11;
3958 const Register tmp3 = r12;
3959 const Register tmp4 = r13;
3960 const Register tmp5 = r14;
3961 const Register tmp6 = r15;
3962 const Register tmp7 = r16;
3963
3964 RegSet spilled_regs = RegSet::of(y, ylen);
3965 BLOCK_COMMENT("Entry:");
3966 __ enter();
3967 __ push(spilled_regs, sp);
3968 __ mov(y, x);
3969 __ mov(ylen, xlen);
3970 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3971 __ pop(spilled_regs, sp);
3972 __ leave();
3973 __ ret(lr);
3974 return start;
3975 }
3976
3977   address generate_mulAdd() {
3978 __ align(CodeEntryAlignment);
3979 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3980
3981 address start = __ pc();
3982
3983 const Register out = r0;
3984 const Register in = r1;
3985 const Register offset = r2;
3986 const Register len = r3;
3987 const Register k = r4;
3988
3989 BLOCK_COMMENT("Entry:");
3990 __ enter();
3991 __ mul_add(out, in, offset, len, k);
3992 __ leave();
3993 __ ret(lr);
3994
3995 return start;
3996 }
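  // Reference semantics (hedged sketch of BigInteger.implMulAdd: multiply
  // in[0..len) by the 32-bit k and add into out, working from the given
  // offset downward; returns the final carry):
  //
  //   uint64_t carry = 0;
  //   offset = out_len - offset - 1;               // the Java code pre-adjusts the offset
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)(uint32_t)in[j] * (uint32_t)k
  //                + (uint32_t)out[offset] + carry;
  //     out[offset--] = (int)p;
  //     carry = p >> 32;
  //   }
  //   return (int)carry;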
3997
3998 // Arguments:
3999 //
4000 // Input:
4001 // c_rarg0 - newArr address
4002 // c_rarg1 - oldArr address
4003 // c_rarg2 - newIdx
4004 // c_rarg3 - shiftCount
4005 // c_rarg4 - numIter
4006 //
4007   address generate_bigIntegerRightShift() {
4008 __ align(CodeEntryAlignment);
4009 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
4010 address start = __ pc();
4011
4012 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4013
4014 Register newArr = c_rarg0;
4015 Register oldArr = c_rarg1;
4016 Register newIdx = c_rarg2;
4017 Register shiftCount = c_rarg3;
4018 Register numIter = c_rarg4;
4019 Register idx = numIter;
4020
4021 Register newArrCur = rscratch1;
4022 Register shiftRevCount = rscratch2;
4023 Register oldArrCur = r13;
4024 Register oldArrNext = r14;
4025
4026 FloatRegister oldElem0 = v0;
4027 FloatRegister oldElem1 = v1;
4028 FloatRegister newElem = v2;
4029 FloatRegister shiftVCount = v3;
4030 FloatRegister shiftVRevCount = v4;
4031
4032 __ cbz(idx, Exit);
4033
4034 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4035
4036 // left shift count
4037 __ movw(shiftRevCount, 32);
4038 __ subw(shiftRevCount, shiftRevCount, shiftCount);
4039
4040     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tails
4041 __ cmp(numIter, (u1)4);
4042 __ br(Assembler::LT, ShiftThree);
4043
4044 __ dup(shiftVCount, __ T4S, shiftCount);
4045 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4046 __ negr(shiftVCount, __ T4S, shiftVCount);
4047
4048 __ BIND(ShiftSIMDLoop);
4049
4050 // Calculate the load addresses
4051 __ sub(idx, idx, 4);
4052 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4053 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
4054 __ add(oldArrCur, oldArrNext, 4);
4055
4056 // Load 4 words and process
4057 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
4058 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
4059 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
4060 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
4061 __ orr(newElem, __ T16B, oldElem0, oldElem1);
4062 __ st1(newElem, __ T4S, Address(newArrCur));
4063
4064 __ cmp(idx, (u1)4);
4065 __ br(Assembler::LT, ShiftTwoLoop);
4066 __ b(ShiftSIMDLoop);
4067
4068 __ BIND(ShiftTwoLoop);
4069 __ cbz(idx, Exit);
4070 __ cmp(idx, (u1)1);
4071 __ br(Assembler::EQ, ShiftOne);
4072
4073 // Calculate the load addresses
4074 __ sub(idx, idx, 2);
4075 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4076 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
4077 __ add(oldArrCur, oldArrNext, 4);
4078
4079 // Load 2 words and process
4080 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
4081 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
4082 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4083 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4084 __ orr(newElem, __ T8B, oldElem0, oldElem1);
4085 __ st1(newElem, __ T2S, Address(newArrCur));
4086 __ b(ShiftTwoLoop);
4087
4088 __ BIND(ShiftThree);
4089 __ tbz(idx, 1, ShiftOne);
4090 __ tbz(idx, 0, ShiftTwo);
4091 __ ldrw(r10, Address(oldArr, 12));
4092 __ ldrw(r11, Address(oldArr, 8));
4093 __ lsrvw(r10, r10, shiftCount);
4094 __ lslvw(r11, r11, shiftRevCount);
4095 __ orrw(r12, r10, r11);
4096 __ strw(r12, Address(newArr, 8));
4097
4098 __ BIND(ShiftTwo);
4099 __ ldrw(r10, Address(oldArr, 8));
4100 __ ldrw(r11, Address(oldArr, 4));
4101 __ lsrvw(r10, r10, shiftCount);
4102 __ lslvw(r11, r11, shiftRevCount);
4103 __ orrw(r12, r10, r11);
4104 __ strw(r12, Address(newArr, 4));
4105
4106 __ BIND(ShiftOne);
4107 __ ldrw(r10, Address(oldArr, 4));
4108 __ ldrw(r11, Address(oldArr));
4109 __ lsrvw(r10, r10, shiftCount);
4110 __ lslvw(r11, r11, shiftRevCount);
4111 __ orrw(r12, r10, r11);
4112 __ strw(r12, Address(newArr));
4113
4114 __ BIND(Exit);
4115 __ ret(lr);
4116
4117 return start;
4118 }
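  // Scalar reference for the kernel above (hedged; matches the ShiftOne /
  // ShiftTwo / ShiftThree tails): each destination word combines the low
  // bits of one old word with the high bits of its more significant
  // neighbour, using unsigned (logical) shifts:
  //
  //   for (int i = numIter - 1; i >= 0; i--)
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)     // lsrvw
  //                        | (oldArr[i] << (32 - shiftCount)); // lslvw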
4119
4120 // Arguments:
4121 //
4122 // Input:
4123 // c_rarg0 - newArr address
4124 // c_rarg1 - oldArr address
4125 // c_rarg2 - newIdx
4126 // c_rarg3 - shiftCount
4127 // c_rarg4 - numIter
4128 //
4129   address generate_bigIntegerLeftShift() {
4130 __ align(CodeEntryAlignment);
4131 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
4132 address start = __ pc();
4133
4134 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4135
4136 Register newArr = c_rarg0;
4137 Register oldArr = c_rarg1;
4138 Register newIdx = c_rarg2;
4139 Register shiftCount = c_rarg3;
4140 Register numIter = c_rarg4;
4141
4142 Register shiftRevCount = rscratch1;
4143 Register oldArrNext = rscratch2;
4144
4145 FloatRegister oldElem0 = v0;
4146 FloatRegister oldElem1 = v1;
4147 FloatRegister newElem = v2;
4148 FloatRegister shiftVCount = v3;
4149 FloatRegister shiftVRevCount = v4;
4150
4151 __ cbz(numIter, Exit);
4152
4153 __ add(oldArrNext, oldArr, 4);
4154 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4155
4156 // right shift count
4157 __ movw(shiftRevCount, 32);
4158 __ subw(shiftRevCount, shiftRevCount, shiftCount);
4159
4160     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tails
4161 __ cmp(numIter, (u1)4);
4162 __ br(Assembler::LT, ShiftThree);
4163
4164 __ dup(shiftVCount, __ T4S, shiftCount);
4165 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4166 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4167
4168 __ BIND(ShiftSIMDLoop);
4169
4170 // load 4 words and process
4171 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
4172 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
4173 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
4174 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
4175 __ orr(newElem, __ T16B, oldElem0, oldElem1);
4176 __ st1(newElem, __ T4S, __ post(newArr, 16));
4177 __ sub(numIter, numIter, 4);
4178
4179 __ cmp(numIter, (u1)4);
4180 __ br(Assembler::LT, ShiftTwoLoop);
4181 __ b(ShiftSIMDLoop);
4182
4183 __ BIND(ShiftTwoLoop);
4184 __ cbz(numIter, Exit);
4185 __ cmp(numIter, (u1)1);
4186 __ br(Assembler::EQ, ShiftOne);
4187
4188 // load 2 words and process
4189 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
4190 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
4191 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4192 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4193 __ orr(newElem, __ T8B, oldElem0, oldElem1);
4194 __ st1(newElem, __ T2S, __ post(newArr, 8));
4195 __ sub(numIter, numIter, 2);
4196 __ b(ShiftTwoLoop);
4197
4198 __ BIND(ShiftThree);
4199 __ ldrw(r10, __ post(oldArr, 4));
4200 __ ldrw(r11, __ post(oldArrNext, 4));
4201 __ lslvw(r10, r10, shiftCount);
4202 __ lsrvw(r11, r11, shiftRevCount);
4203 __ orrw(r12, r10, r11);
4204 __ strw(r12, __ post(newArr, 4));
4205 __ tbz(numIter, 1, Exit);
4206 __ tbz(numIter, 0, ShiftOne);
4207
4208 __ BIND(ShiftTwo);
4209 __ ldrw(r10, __ post(oldArr, 4));
4210 __ ldrw(r11, __ post(oldArrNext, 4));
4211 __ lslvw(r10, r10, shiftCount);
4212 __ lsrvw(r11, r11, shiftRevCount);
4213 __ orrw(r12, r10, r11);
4214 __ strw(r12, __ post(newArr, 4));
4215
4216 __ BIND(ShiftOne);
4217 __ ldrw(r10, Address(oldArr));
4218 __ ldrw(r11, Address(oldArrNext));
4219 __ lslvw(r10, r10, shiftCount);
4220 __ lsrvw(r11, r11, shiftRevCount);
4221 __ orrw(r12, r10, r11);
4222 __ strw(r12, Address(newArr));
4223
4224 __ BIND(Exit);
4225 __ ret(lr);
4226
4227 return start;
4228 }
4229
4230   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
4231 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
4232 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
4233 // Karatsuba multiplication performs a 128*128 -> 256-bit
4234 // multiplication in three 128-bit multiplications and a few
4235 // additions.
4236 //
4237 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
4238 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
4239 //
4240 // Inputs:
4241 //
4242 // A0 in a.d[0] (subkey)
4243 // A1 in a.d[1]
4244 // (A1+A0) in a1_xor_a0.d[0]
4245 //
4246 // B0 in b.d[0] (state)
4247 // B1 in b.d[1]
4248
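    // Derivation (all additions are XOR in GF(2)): expanding the middle term
    //   E1:E0 = (A1+A0)(B1+B0) = A1B1 + A1B0 + A0B1 + A0B0
    // gives the cross products A1B0 + A0B1 = (E1:E0) + (C1:C0) + (D1:D0);
    // shifting that 128-bit value left by 64 and adding C*z^128 and D yields
    // exactly the C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 lanes quoted above.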
4249 __ ext(tmp1, __ T16B, b, b, 0x08);
4250 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
4251 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
4252 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
4253 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
4254
4255 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
4256 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
4257 __ eor(tmp2, __ T16B, tmp2, tmp4);
4258 __ eor(tmp2, __ T16B, tmp2, tmp3);
4259
4260 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
4261 __ ins(result_hi, __ D, tmp2, 0, 1);
4262 __ ins(result_lo, __ D, tmp2, 1, 0);
4263 }
4264
4265   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
4266 FloatRegister p, FloatRegister z, FloatRegister t1) {
4267 const FloatRegister t0 = result;
4268
4269 // The GCM field polynomial f is z^128 + p(z), where p =
4270 // z^7+z^2+z+1.
4271 //
4272 // z^128 === -p(z) (mod (z^128 + p(z)))
4273 //
4274 // so, given that the product we're reducing is
4275 // a == lo + hi * z^128
4276 // substituting,
4277 // === lo - hi * p(z) (mod (z^128 + p(z)))
4278 //
4279 // we reduce by multiplying hi by p(z) and subtracting the result
4280 // from (i.e. XORing it with) lo. Because p has no nonzero high
4281 // bits we can do this with two 64-bit multiplications, lo*p and
4282 // hi*p.
4283
4284 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
4285 __ ext(t1, __ T16B, t0, z, 8);
4286 __ eor(hi, __ T16B, hi, t1);
4287 __ ext(t1, __ T16B, z, t0, 8);
4288 __ eor(lo, __ T16B, lo, t1);
4289 __ pmull(t0, __ T1Q, hi, p, __ T1D);
4290 __ eor(result, __ T16B, lo, t0);
4291 }
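  // Note: p(z) = z^7 + z^2 + z + 1 is just the byte 0x87, so each hi*p
  // product above fits comfortably in 128 bits (at most 64 + 7 = 71 bits per
  // 64-bit lane); the ext/eor pairs fold those partial products back across
  // the 64-bit lanes of the 128-bit accumulator.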
4292
4293   address generate_has_negatives(address &has_negatives_long) {
4294 const u1 large_loop_size = 64;
4295 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4296 int dcache_line = VM_Version::dcache_line_size();
4297
4298 Register ary1 = r1, len = r2, result = r0;
4299
4300 __ align(CodeEntryAlignment);
4301
4302 StubCodeMark mark(this, "StubRoutines", "has_negatives");
4303
4304 address entry = __ pc();
4305
4306 __ enter();
4307
4308 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4309 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4310
4311 __ cmp(len, (u1)15);
4312 __ br(Assembler::GT, LEN_OVER_15);
4313     // The only case in which execution falls into this code is when the pointer
4314     // is near the end of a memory page and we must avoid reading past it.
4315 __ add(ary1, ary1, len);
4316 __ subs(len, len, 8);
4317 __ br(Assembler::GT, LEN_OVER_8);
4318 __ ldr(rscratch2, Address(ary1, -8));
4319 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
4320 __ lsrv(rscratch2, rscratch2, rscratch1);
4321 __ tst(rscratch2, UPPER_BIT_MASK);
4322 __ cset(result, Assembler::NE);
4323 __ leave();
4324 __ ret(lr);
4325 __ bind(LEN_OVER_8);
4326 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4327 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
4328 __ tst(rscratch2, UPPER_BIT_MASK);
4329 __ br(Assembler::NE, RET_TRUE_NO_POP);
4330 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4331 __ lsrv(rscratch1, rscratch1, rscratch2);
4332 __ tst(rscratch1, UPPER_BIT_MASK);
4333 __ cset(result, Assembler::NE);
4334 __ leave();
4335 __ ret(lr);
4336
4337 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4338 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4339
4340 has_negatives_long = __ pc(); // 2nd entry point
4341
4342 __ enter();
4343
4344 __ bind(LEN_OVER_15);
4345 __ push(spilled_regs, sp);
4346 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4347 __ cbz(rscratch2, ALIGNED);
4348 __ ldp(tmp6, tmp1, Address(ary1));
4349 __ mov(tmp5, 16);
4350 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4351 __ add(ary1, ary1, rscratch1);
4352 __ sub(len, len, rscratch1);
4353 __ orr(tmp6, tmp6, tmp1);
4354 __ tst(tmp6, UPPER_BIT_MASK);
4355 __ br(Assembler::NE, RET_TRUE);
4356
4357 __ bind(ALIGNED);
4358 __ cmp(len, large_loop_size);
4359 __ br(Assembler::LT, CHECK_16);
4360     // Perform a 16-byte load here in the pre-loop as an early-return check: if an
4361     // initially aligned large array has negative values in its leading bytes,
4362     // LARGE_LOOP would otherwise perform up to 4 reads instead of 1 in the worst
4363     // case, which is slower. Cases with negative bytes further ahead are barely
4364     // affected; in fact they become slightly faster thanks to the early loads and
4365     // the fewer instructions and branches in LARGE_LOOP.
4366 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4367 __ sub(len, len, 16);
4368 __ orr(tmp6, tmp6, tmp1);
4369 __ tst(tmp6, UPPER_BIT_MASK);
4370 __ br(Assembler::NE, RET_TRUE);
4371 __ cmp(len, large_loop_size);
4372 __ br(Assembler::LT, CHECK_16);
4373
4374 if (SoftwarePrefetchHintDistance >= 0
4375 && SoftwarePrefetchHintDistance >= dcache_line) {
4376 // initial prefetch
4377 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4378 }
4379 __ bind(LARGE_LOOP);
4380 if (SoftwarePrefetchHintDistance >= 0) {
4381 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4382 }
4383     // Issue the load instructions first, since that can save a few CPU/memory
4384     // cycles. Also, instead of four "orr(...); tst(...); br(...);" triples (one
4385     // per ldp), generate 7 orr(...) + 1 tst(...) + 1 br(...), which executes
4386     // fewer instructions and branches per iteration. The trade-off is that early
4387     // return is disabled: all 64 bytes are loaded and checked every time.
4388 __ ldp(tmp2, tmp3, Address(ary1));
4389 __ ldp(tmp4, tmp5, Address(ary1, 16));
4390 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4391 __ ldp(tmp6, tmp1, Address(ary1, 48));
4392 __ add(ary1, ary1, large_loop_size);
4393 __ sub(len, len, large_loop_size);
4394 __ orr(tmp2, tmp2, tmp3);
4395 __ orr(tmp4, tmp4, tmp5);
4396 __ orr(rscratch1, rscratch1, rscratch2);
4397 __ orr(tmp6, tmp6, tmp1);
4398 __ orr(tmp2, tmp2, tmp4);
4399 __ orr(rscratch1, rscratch1, tmp6);
4400 __ orr(tmp2, tmp2, rscratch1);
4401 __ tst(tmp2, UPPER_BIT_MASK);
4402 __ br(Assembler::NE, RET_TRUE);
4403 __ cmp(len, large_loop_size);
4404 __ br(Assembler::GE, LARGE_LOOP);
4405
4406 __ bind(CHECK_16); // small 16-byte load pre-loop
4407 __ cmp(len, (u1)16);
4408 __ br(Assembler::LT, POST_LOOP16);
4409
4410 __ bind(LOOP16); // small 16-byte load loop
4411 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4412 __ sub(len, len, 16);
4413 __ orr(tmp2, tmp2, tmp3);
4414 __ tst(tmp2, UPPER_BIT_MASK);
4415 __ br(Assembler::NE, RET_TRUE);
4416 __ cmp(len, (u1)16);
4417 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4418
4419 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4420 __ cmp(len, (u1)8);
4421 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4422 __ ldr(tmp3, Address(__ post(ary1, 8)));
4423 __ sub(len, len, 8);
4424 __ tst(tmp3, UPPER_BIT_MASK);
4425 __ br(Assembler::NE, RET_TRUE);
4426
4427 __ bind(POST_LOOP16_LOAD_TAIL);
4428 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4429 __ ldr(tmp1, Address(ary1));
4430 __ mov(tmp2, 64);
4431 __ sub(tmp4, tmp2, len, __ LSL, 3);
4432 __ lslv(tmp1, tmp1, tmp4);
4433 __ tst(tmp1, UPPER_BIT_MASK);
4434 __ br(Assembler::NE, RET_TRUE);
4435 // Fallthrough
4436
4437 __ bind(RET_FALSE);
4438 __ pop(spilled_regs, sp);
4439 __ leave();
4440 __ mov(result, zr);
4441 __ ret(lr);
4442
4443 __ bind(RET_TRUE);
4444 __ pop(spilled_regs, sp);
4445 __ bind(RET_TRUE_NO_POP);
4446 __ leave();
4447 __ mov(result, 1);
4448 __ ret(lr);
4449
4450 __ bind(DONE);
4451 __ pop(spilled_regs, sp);
4452 __ leave();
4453 __ ret(lr);
4454 return entry;
4455 }
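  // Scalar reference (the stub answers: does the byte array contain any
  // byte with the sign bit set, i.e. any non-ASCII byte?):
  //
  //   bool has_negatives_ref(const int8_t* ary, size_t len) {
  //     for (size_t i = 0; i < len; i++)
  //       if (ary[i] < 0) return true;     // same as (ary[i] & 0x80) != 0
  //     return false;
  //   }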
4456
4457   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4458 bool usePrefetch, Label &NOT_EQUAL) {
4459 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4460 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4461 tmp7 = r12, tmp8 = r13;
4462 Label LOOP;
4463
4464 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4465 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4466 __ bind(LOOP);
4467 if (usePrefetch) {
4468 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4469 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4470 }
4471 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4472 __ eor(tmp1, tmp1, tmp2);
4473 __ eor(tmp3, tmp3, tmp4);
4474 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4475 __ orr(tmp1, tmp1, tmp3);
4476 __ cbnz(tmp1, NOT_EQUAL);
4477 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4478 __ eor(tmp5, tmp5, tmp6);
4479 __ eor(tmp7, tmp7, tmp8);
4480 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4481 __ orr(tmp5, tmp5, tmp7);
4482 __ cbnz(tmp5, NOT_EQUAL);
4483 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4484 __ eor(tmp1, tmp1, tmp2);
4485 __ eor(tmp3, tmp3, tmp4);
4486 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4487 __ orr(tmp1, tmp1, tmp3);
4488 __ cbnz(tmp1, NOT_EQUAL);
4489 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4490 __ eor(tmp5, tmp5, tmp6);
4491 __ sub(cnt1, cnt1, 8 * wordSize);
4492 __ eor(tmp7, tmp7, tmp8);
4493 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4494     // tmp6 is not used. MacroAssembler::subs is used here (rather than cmp)
4495     // because the subs macro accepts an immediate operand of arbitrary size.
4496 __ subs(tmp6, cnt1, loopThreshold);
4497 __ orr(tmp5, tmp5, tmp7);
4498 __ cbnz(tmp5, NOT_EQUAL);
4499 __ br(__ GE, LOOP);
4500 // post-loop
4501 __ eor(tmp1, tmp1, tmp2);
4502 __ eor(tmp3, tmp3, tmp4);
4503 __ orr(tmp1, tmp1, tmp3);
4504 __ sub(cnt1, cnt1, 2 * wordSize);
4505 __ cbnz(tmp1, NOT_EQUAL);
4506 }
4507
4508   void generate_large_array_equals_loop_simd(int loopThreshold,
4509 bool usePrefetch, Label &NOT_EQUAL) {
4510 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4511 tmp2 = rscratch2;
4512 Label LOOP;
4513
4514 __ bind(LOOP);
4515 if (usePrefetch) {
4516 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4517 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4518 }
4519 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4520 __ sub(cnt1, cnt1, 8 * wordSize);
4521 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4522 __ subs(tmp1, cnt1, loopThreshold);
4523 __ eor(v0, __ T16B, v0, v4);
4524 __ eor(v1, __ T16B, v1, v5);
4525 __ eor(v2, __ T16B, v2, v6);
4526 __ eor(v3, __ T16B, v3, v7);
4527 __ orr(v0, __ T16B, v0, v1);
4528 __ orr(v1, __ T16B, v2, v3);
4529 __ orr(v0, __ T16B, v0, v1);
4530 __ umov(tmp1, v0, __ D, 0);
4531 __ umov(tmp2, v0, __ D, 1);
4532 __ orr(tmp1, tmp1, tmp2);
4533 __ cbnz(tmp1, NOT_EQUAL);
4534 __ br(__ GE, LOOP);
4535 }
4536
4537 // a1 = r1 - array1 address
4538 // a2 = r2 - array2 address
4539 // result = r0 - return value. Already contains "false"
4540 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4541 // r3-r5 are reserved temporary registers
4542   address generate_large_array_equals() {
4543 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4544 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4545 tmp7 = r12, tmp8 = r13;
4546 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4547 SMALL_LOOP, POST_LOOP;
4548 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4549     // thresholds chosen so that at least 32 prefetched bytes are actually used
4550 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4551 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4552 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4553 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4554 tmp5, tmp6, tmp7, tmp8);
4555
4556 __ align(CodeEntryAlignment);
4557
4558 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4559
4560 address entry = __ pc();
4561 __ enter();
4562 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
4563 // also advance pointers to use post-increment instead of pre-increment
4564 __ add(a1, a1, wordSize);
4565 __ add(a2, a2, wordSize);
4566 if (AvoidUnalignedAccesses) {
4567       // Both implementations (SIMD and non-SIMD) use relatively wide load
4568       // instructions (ld1/ldp), which incur a large penalty (up to 2x execution
4569       // time) on some CPUs when the address is not at least 16-byte aligned.
4570       // Arrays are currently 8-byte aligned, so perform an additional 8-byte
4571       // load if needed to make at least the first address 16-byte aligned.
4572 Label ALIGNED16;
4573 __ tbz(a1, 3, ALIGNED16);
4574 __ ldr(tmp1, Address(__ post(a1, wordSize)));
4575 __ ldr(tmp2, Address(__ post(a2, wordSize)));
4576 __ sub(cnt1, cnt1, wordSize);
4577 __ eor(tmp1, tmp1, tmp2);
4578 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4579 __ bind(ALIGNED16);
4580 }
4581 if (UseSIMDForArrayEquals) {
4582 if (SoftwarePrefetchHintDistance >= 0) {
4583 __ subs(tmp1, cnt1, prefetchLoopThreshold);
4584 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4585 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4586 /* prfm = */ true, NOT_EQUAL);
4587 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4588 __ br(__ LT, TAIL);
4589 }
4590 __ bind(NO_PREFETCH_LARGE_LOOP);
4591 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4592 /* prfm = */ false, NOT_EQUAL);
4593 } else {
4594 __ push(spilled_regs, sp);
4595 if (SoftwarePrefetchHintDistance >= 0) {
4596 __ subs(tmp1, cnt1, prefetchLoopThreshold);
4597 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4598 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4599 /* prfm = */ true, NOT_EQUAL);
4600 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4601 __ br(__ LT, TAIL);
4602 }
4603 __ bind(NO_PREFETCH_LARGE_LOOP);
4604 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4605 /* prfm = */ false, NOT_EQUAL);
4606 }
4607 __ bind(TAIL);
4608 __ cbz(cnt1, EQUAL);
4609 __ subs(cnt1, cnt1, wordSize);
4610 __ br(__ LE, POST_LOOP);
4611 __ bind(SMALL_LOOP);
4612 __ ldr(tmp1, Address(__ post(a1, wordSize)));
4613 __ ldr(tmp2, Address(__ post(a2, wordSize)));
4614 __ subs(cnt1, cnt1, wordSize);
4615 __ eor(tmp1, tmp1, tmp2);
4616 __ cbnz(tmp1, NOT_EQUAL);
4617 __ br(__ GT, SMALL_LOOP);
4618 __ bind(POST_LOOP);
4619 __ ldr(tmp1, Address(a1, cnt1));
4620 __ ldr(tmp2, Address(a2, cnt1));
4621 __ eor(tmp1, tmp1, tmp2);
4622 __ cbnz(tmp1, NOT_EQUAL);
4623 __ bind(EQUAL);
4624 __ mov(result, true);
4625 __ bind(NOT_EQUAL);
4626 if (!UseSIMDForArrayEquals) {
4627 __ pop(spilled_regs, sp);
4628 }
4629 __ bind(NOT_EQUAL_NO_POP);
4630 __ leave();
4631 __ ret(lr);
4632 return entry;
4633 }
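  // Reference semantics (hedged; the caller has already compared the first
  // word and cnt1 counts the remaining bytes): a word-at-a-time equality
  // check, i.e. roughly
  //
  //   bool equal = true;
  //   for (size_t i = 0; i < cnt1 && equal; i += wordSize)
  //     equal = (*(const uint64_t*)(a1 + i) == *(const uint64_t*)(a2 + i));
  //
  // with the POST_LOOP issuing one final, possibly overlapping, load.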
4634
4635   address generate_dsin_dcos(bool isCos) {
4636 __ align(CodeEntryAlignment);
4637 StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4638 address start = __ pc();
4639 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4640 (address)StubRoutines::aarch64::_two_over_pi,
4641 (address)StubRoutines::aarch64::_pio2,
4642 (address)StubRoutines::aarch64::_dsin_coef,
4643 (address)StubRoutines::aarch64::_dcos_coef);
4644 return start;
4645 }
4646
4647   address generate_dlog() {
4648 __ align(CodeEntryAlignment);
4649 StubCodeMark mark(this, "StubRoutines", "dlog");
4650 address entry = __ pc();
4651 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4652 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4653 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4654 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4655 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4656 return entry;
4657 }
4658
4659   // Code for comparing 16 bytes of strings with the same encoding. Software-
       // pipelined: tmp1/tmp2 arrive holding the current 8 bytes of each string
       // and are reloaded here for the following iteration.
4660   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4661 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4662 __ ldr(rscratch1, Address(__ post(str1, 8)));
4663 __ eor(rscratch2, tmp1, tmp2);
4664 __ ldr(cnt1, Address(__ post(str2, 8)));
4665 __ cbnz(rscratch2, DIFF1);
4666 __ ldr(tmp1, Address(__ post(str1, 8)));
4667 __ eor(rscratch2, rscratch1, cnt1);
4668 __ ldr(tmp2, Address(__ post(str2, 8)));
4669 __ cbnz(rscratch2, DIFF2);
4670 }
4671
4672 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4673   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4674 Label &DIFF2) {
4675 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4676 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4677
4678 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4679 __ ldr(tmpU, Address(__ post(cnt1, 8)));
4680 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4681 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4682
4683 __ fmovd(tmpL, vtmp3);
4684 __ eor(rscratch2, tmp3, tmpL);
4685 __ cbnz(rscratch2, DIFF2);
4686
4687 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4688 __ umov(tmpL, vtmp3, __ D, 1);
4689 __ eor(rscratch2, tmpU, tmpL);
4690 __ cbnz(rscratch2, DIFF1);
4691
4692 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4693 __ ldr(tmpU, Address(__ post(cnt1, 8)));
4694 __ fmovd(tmpL, vtmp);
4695 __ eor(rscratch2, tmp3, tmpL);
4696 __ cbnz(rscratch2, DIFF2);
4697
4698 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4699 __ umov(tmpL, vtmp, __ D, 1);
4700 __ eor(rscratch2, tmpU, tmpL);
4701 __ cbnz(rscratch2, DIFF1);
4702 }
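  // The zip1/zip2 pairs above inflate Latin-1 to UTF-16 by interleaving the
  // loaded bytes with the zeroed register vtmpZ, e.g. for the low half:
  //
  //   vtmp : a b c d e f g h ...   (bytes)
  //   vtmpZ: 0 0 0 0 0 0 0 0 ...
  //   zip1 : a 0 b 0 c 0 d 0 ...   == UTF-16LE 'a','b','c','d'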
4703
4704 // r0 = result
4705 // r1 = str1
4706 // r2 = cnt1
4707 // r3 = str2
4708 // r4 = cnt2
4709 // r10 = tmp1
4710 // r11 = tmp2
4711   address generate_compare_long_string_different_encoding(bool isLU) {
4712 __ align(CodeEntryAlignment);
4713 StubCodeMark mark(this, "StubRoutines", isLU
4714 ? "compare_long_string_different_encoding LU"
4715 : "compare_long_string_different_encoding UL");
4716 address entry = __ pc();
4717 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4718 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4719 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4720 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4721 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4722 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4723 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4724
4725 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4726
4727 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4728 // cnt2 == amount of characters left to compare
4729     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4730 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4731 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4732 __ add(str2, str2, isLU ? wordSize : wordSize/2);
4733 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4734 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4735 __ eor(rscratch2, tmp1, tmp2);
4736 __ mov(rscratch1, tmp2);
4737 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4738 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4739 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4740 __ push(spilled_regs, sp);
4741 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4742 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4743
4744 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4745
4746 if (SoftwarePrefetchHintDistance >= 0) {
4747 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4748 __ br(__ LT, NO_PREFETCH);
4749 __ bind(LARGE_LOOP_PREFETCH);
4750 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4751 __ mov(tmp4, 2);
4752 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4753 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4754 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4755 __ subs(tmp4, tmp4, 1);
4756 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4757 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4758 __ mov(tmp4, 2);
4759 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4760 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4761 __ subs(tmp4, tmp4, 1);
4762 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4763 __ sub(cnt2, cnt2, 64);
4764 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4765 __ br(__ GE, LARGE_LOOP_PREFETCH);
4766 }
4767 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4768 __ bind(NO_PREFETCH);
4769 __ subs(cnt2, cnt2, 16);
4770 __ br(__ LT, TAIL);
4771 __ align(OptoLoopAlignment);
4772 __ bind(SMALL_LOOP); // smaller loop
4773 __ subs(cnt2, cnt2, 16);
4774 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4775 __ br(__ GE, SMALL_LOOP);
4776 __ cmn(cnt2, (u1)16);
4777 __ br(__ EQ, LOAD_LAST);
4778 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4779 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4780 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4781 __ ldr(tmp3, Address(cnt1, -8));
4782 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4783 __ b(LOAD_LAST);
4784 __ bind(DIFF2);
4785 __ mov(tmpU, tmp3);
4786 __ bind(DIFF1);
4787 __ pop(spilled_regs, sp);
4788 __ b(CALCULATE_DIFFERENCE);
4789 __ bind(LOAD_LAST);
4790 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4791 // No need to load it again
4792 __ mov(tmpU, tmp3);
4793 __ pop(spilled_regs, sp);
4794
4795 // tmp2 points to the address of the last 4 Latin1 characters right now
4796 __ ldrs(vtmp, Address(tmp2));
4797 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4798 __ fmovd(tmpL, vtmp);
4799
4800 __ eor(rscratch2, tmpU, tmpL);
4801 __ cbz(rscratch2, DONE);
4802
4803 // Find the first different characters in the longwords and
4804 // compute their difference.
4805 __ bind(CALCULATE_DIFFERENCE);
4806 __ rev(rscratch2, rscratch2);
4807 __ clz(rscratch2, rscratch2);
4808 __ andr(rscratch2, rscratch2, -16);
4809 __ lsrv(tmp1, tmp1, rscratch2);
4810 __ uxthw(tmp1, tmp1);
4811 __ lsrv(rscratch1, rscratch1, rscratch2);
4812 __ uxthw(rscratch1, rscratch1);
4813 __ subw(result, tmp1, rscratch1);
4814 __ bind(DONE);
4815 __ ret(lr);
4816 return entry;
4817 }
4818
4819   address generate_method_entry_barrier() {
4820 __ align(CodeEntryAlignment);
4821 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4822
4823 Label deoptimize_label;
4824
4825 address start = __ pc();
4826
4827 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4828
4829 __ enter();
4830 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
4831
4832 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
4833
4834 __ push_call_clobbered_registers();
4835
4836 __ mov(c_rarg0, rscratch2);
4837 __ call_VM_leaf
4838 (CAST_FROM_FN_PTR
4839 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4840
4841 __ reset_last_Java_frame(true);
4842
4843 __ mov(rscratch1, r0);
4844
4845 __ pop_call_clobbered_registers();
4846
4847 __ cbnz(rscratch1, deoptimize_label);
4848
4849 __ leave();
4850 __ ret(lr);
4851
4852 __ BIND(deoptimize_label);
4853
4854 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4855 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4856
4857 __ mov(sp, rscratch1);
4858 __ br(rscratch2);
4859
4860 return start;
4861 }
4862
4863 // r0 = result
4864 // r1 = str1
4865 // r2 = cnt1
4866 // r3 = str2
4867 // r4 = cnt2
4868 // r10 = tmp1
4869 // r11 = tmp2
4870   address generate_compare_long_string_same_encoding(bool isLL) {
4871 __ align(CodeEntryAlignment);
4872 StubCodeMark mark(this, "StubRoutines", isLL
4873 ? "compare_long_string_same_encoding LL"
4874 : "compare_long_string_same_encoding UU");
4875 address entry = __ pc();
4876 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4877 tmp1 = r10, tmp2 = r11;
4878 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4879 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4880 DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4881 // exit from the large loop when fewer than 64 bytes are left to read or
4882 // we're about to prefetch memory past the end of the array
4883 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4884 // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4885 // Update the cnt2 counter for the 8 bytes that have already been loaded.
4886 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4887 // advance the pointers past the previous read
4888 __ add(str1, str1, wordSize);
4889 __ add(str2, str2, wordSize);
4890 if (SoftwarePrefetchHintDistance >= 0) {
4891 __ bind(LARGE_LOOP_PREFETCH);
4892 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4893 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4894 compare_string_16_bytes_same(DIFF, DIFF2);
4895 compare_string_16_bytes_same(DIFF, DIFF2);
4896 __ sub(cnt2, cnt2, isLL ? 64 : 32);
4897 compare_string_16_bytes_same(DIFF, DIFF2);
4898 __ subs(rscratch2, cnt2, largeLoopExitCondition);
4899 compare_string_16_bytes_same(DIFF, DIFF2);
4900 __ br(__ GT, LARGE_LOOP_PREFETCH);
4901 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4902 }
4903 // less than 16 bytes left?
4904 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4905 __ br(__ LT, TAIL);
4906 __ align(OptoLoopAlignment);
4907 __ bind(SMALL_LOOP);
4908 compare_string_16_bytes_same(DIFF, DIFF2);
4909 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4910 __ br(__ GE, SMALL_LOOP);
4911 __ bind(TAIL);
4912 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4913 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4914 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4915 __ br(__ LE, CHECK_LAST);
4916 __ eor(rscratch2, tmp1, tmp2);
4917 __ cbnz(rscratch2, DIFF);
4918 __ ldr(tmp1, Address(__ post(str1, 8)));
4919 __ ldr(tmp2, Address(__ post(str2, 8)));
4920 __ sub(cnt2, cnt2, isLL ? 8 : 4);
4921 __ bind(CHECK_LAST);
4922 if (!isLL) {
4923 __ add(cnt2, cnt2, cnt2); // now in bytes
4924 }
4925 __ eor(rscratch2, tmp1, tmp2);
4926 __ cbnz(rscratch2, DIFF);
4927 __ ldr(rscratch1, Address(str1, cnt2));
4928 __ ldr(cnt1, Address(str2, cnt2));
4929 __ eor(rscratch2, rscratch1, cnt1);
4930 __ cbz(rscratch2, LENGTH_DIFF);
4931 // Find the first different characters in the longwords and
4932 // compute their difference.
4933 __ bind(DIFF2);
4934 __ rev(rscratch2, rscratch2);
4935 __ clz(rscratch2, rscratch2);
4936 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4937 __ lsrv(rscratch1, rscratch1, rscratch2);
4938 if (isLL) {
4939 __ lsrv(cnt1, cnt1, rscratch2);
4940 __ uxtbw(rscratch1, rscratch1);
4941 __ uxtbw(cnt1, cnt1);
4942 } else {
4943 __ lsrv(cnt1, cnt1, rscratch2);
4944 __ uxthw(rscratch1, rscratch1);
4945 __ uxthw(cnt1, cnt1);
4946 }
4947 __ subw(result, rscratch1, cnt1);
4948 __ b(LENGTH_DIFF);
4949 __ bind(DIFF);
4950 __ rev(rscratch2, rscratch2);
4951 __ clz(rscratch2, rscratch2);
4952 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4953 __ lsrv(tmp1, tmp1, rscratch2);
4954 if (isLL) {
4955 __ lsrv(tmp2, tmp2, rscratch2);
4956 __ uxtbw(tmp1, tmp1);
4957 __ uxtbw(tmp2, tmp2);
4958 } else {
4959 __ lsrv(tmp2, tmp2, rscratch2);
4960 __ uxthw(tmp1, tmp1);
4961 __ uxthw(tmp2, tmp2);
4962 }
4963 __ subw(result, tmp1, tmp2);
4964 __ b(LENGTH_DIFF);
4965 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4966 __ eor(rscratch2, tmp1, tmp2);
4967 __ cbnz(rscratch2, DIFF);
4968 __ bind(LENGTH_DIFF);
4969 __ ret(lr);
4970 return entry;
4971 }
4972
4973   void generate_compare_long_strings() {
4974 StubRoutines::aarch64::_compare_long_string_LL
4975 = generate_compare_long_string_same_encoding(true);
4976 StubRoutines::aarch64::_compare_long_string_UU
4977 = generate_compare_long_string_same_encoding(false);
4978 StubRoutines::aarch64::_compare_long_string_LU
4979 = generate_compare_long_string_different_encoding(true);
4980 StubRoutines::aarch64::_compare_long_string_UL
4981 = generate_compare_long_string_different_encoding(false);
4982 }
4983
4984 // R0 = result
4985 // R1 = str2
4986 // R2 = cnt1
4987 // R3 = str1
4988 // R4 = cnt2
4989 // This generic linear code uses a few additional ideas which make it faster:
4990 // 1) we can safely keep at least the 1st register of the pattern (since
4991 //    length >= 8) in order to skip the initial load (helps on systems with
4992 //    a single load pipeline)
4993 // 2) we can use the "fast" single-character search algorithm to find the
4994 //    first symbol (one branch per loaded register instead of one branch per
4995 //    symbol); this is where constants like 0x0101...01, 0x00010001...0001,
4996 //    0x7f7f...7f and 0x7fff7fff...7fff come from
4997 // 3) after loading and analyzing the 1st register of the source string, it
4998 //    can be used to search for every occurrence of the 1st character,
4999 //    saving a few loads compared with a simpler-but-slower implementation
5000 // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5001 //    re-initializes and compresses register values, which makes the code larger
5002 //    and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
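// As a worked example of idea 2), for Latin1 the candidate register ch2 is
// XORed with `first` (the 1st pattern character splatted into every byte), so
// matching bytes become 0x00. The classic zero-byte test then flags them:
//
//   matches = (ch2 - 0x0101010101010101) & ~(ch2 | 0x7f7f7f7f7f7f7f7f)
//
// Each byte of `matches` has its top bit set iff the corresponding byte of
// ch2 was zero, i.e. iff that position matched the 1st pattern character.
// The sub/orr/bics sequences below compute exactly this; the 16-bit constants
// 0x0001...0001 and 0x7fff...7fff are the UTF-16 analogue.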
5003   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5004 const char* stubName = str1_isL
5005 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5006 : "indexof_linear_uu";
5007 __ align(CodeEntryAlignment);
5008 StubCodeMark mark(this, "StubRoutines", stubName);
5009 address entry = __ pc();
5010
5011 int str1_chr_size = str1_isL ? 1 : 2;
5012 int str2_chr_size = str2_isL ? 1 : 2;
5013 int str1_chr_shift = str1_isL ? 0 : 1;
5014 int str2_chr_shift = str2_isL ? 0 : 1;
5015 bool isL = str1_isL && str2_isL;
5016 // parameters
5017 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5018 // temporary registers
5019 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5020 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5021 // redefinitions
5022 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5023
5024 __ push(spilled_regs, sp);
5025 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5026 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5027 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5028 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5029 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5030 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5031 // Read a whole register from str1; this is safe because length >= 8 here
5032 __ ldr(ch1, Address(str1));
5033 // Read a whole register from str2; this is safe because length >= 8 here
5034 __ ldr(ch2, Address(str2));
5035 __ sub(cnt2, cnt2, cnt1);
5036 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5037 if (str1_isL != str2_isL) {
5038 __ eor(v0, __ T16B, v0, v0);
5039 }
5040 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5041 __ mul(first, first, tmp1);
5042 // check if we have less than 1 register to check
5043 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5044 if (str1_isL != str2_isL) {
5045 __ fmovd(v1, ch1);
5046 }
5047 __ br(__ LE, L_SMALL);
5048 __ eor(ch2, first, ch2);
5049 if (str1_isL != str2_isL) {
5050 __ zip1(v1, __ T16B, v1, v0);
5051 }
5052 __ sub(tmp2, ch2, tmp1);
5053 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5054 __ bics(tmp2, tmp2, ch2);
5055 if (str1_isL != str2_isL) {
5056 __ fmovd(ch1, v1);
5057 }
5058 __ br(__ NE, L_HAS_ZERO);
5059 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5060 __ add(result, result, wordSize/str2_chr_size);
5061 __ add(str2, str2, wordSize);
5062 __ br(__ LT, L_POST_LOOP);
5063 __ BIND(L_LOOP);
5064 __ ldr(ch2, Address(str2));
5065 __ eor(ch2, first, ch2);
5066 __ sub(tmp2, ch2, tmp1);
5067 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5068 __ bics(tmp2, tmp2, ch2);
5069 __ br(__ NE, L_HAS_ZERO);
5070 __ BIND(L_LOOP_PROCEED);
5071 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5072 __ add(str2, str2, wordSize);
5073 __ add(result, result, wordSize/str2_chr_size);
5074 __ br(__ GE, L_LOOP);
5075 __ BIND(L_POST_LOOP);
5076 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5077 __ br(__ LE, NOMATCH);
5078 __ ldr(ch2, Address(str2));
5079 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5080 __ eor(ch2, first, ch2);
5081 __ sub(tmp2, ch2, tmp1);
5082 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5083 __ mov(tmp4, -1); // all bits set
5084 __ b(L_SMALL_PROCEED);
5085 __ align(OptoLoopAlignment);
5086 __ BIND(L_SMALL);
5087 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5088 __ eor(ch2, first, ch2);
5089 if (str1_isL != str2_isL) {
5090 __ zip1(v1, __ T16B, v1, v0);
5091 }
5092 __ sub(tmp2, ch2, tmp1);
5093 __ mov(tmp4, -1); // all bits set
5094 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5095 if (str1_isL != str2_isL) {
5096 __ fmovd(ch1, v1); // move converted 4 symbols
5097 }
5098 __ BIND(L_SMALL_PROCEED);
5099 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5100 __ bic(tmp2, tmp2, ch2);
5101 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5102 __ rbit(tmp2, tmp2);
5103 __ br(__ EQ, NOMATCH);
5104 __ BIND(L_SMALL_HAS_ZERO_LOOP);
5105 __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
5106 __ cmp(cnt1, u1(wordSize/str2_chr_size));
5107 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5108 if (str2_isL) { // LL
5109 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5110 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5111 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5112 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5113 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5114 } else {
5115 __ mov(ch2, 0xE); // all bits in byte set except last one
5116 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5117 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5118 __ lslv(tmp2, tmp2, tmp4);
5119 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5120 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5121 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5122 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5123 }
5124 __ cmp(ch1, ch2);
5125 __ mov(tmp4, wordSize/str2_chr_size);
5126 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5127 __ BIND(L_SMALL_CMP_LOOP);
5128 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5129 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5130 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5131 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5132 __ add(tmp4, tmp4, 1);
5133 __ cmp(tmp4, cnt1);
5134 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5135 __ cmp(first, ch2);
5136 __ br(__ EQ, L_SMALL_CMP_LOOP);
5137 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5138 __ cbz(tmp2, NOMATCH); // no more matches. exit
5139 __ clz(tmp4, tmp2);
5140 __ add(result, result, 1); // advance index
5141 __ add(str2, str2, str2_chr_size); // advance pointer
5142 __ b(L_SMALL_HAS_ZERO_LOOP);
5143 __ align(OptoLoopAlignment);
5144 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5145 __ cmp(first, ch2);
5146 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5147 __ b(DONE);
5148 __ align(OptoLoopAlignment);
5149 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5150 if (str2_isL) { // LL
5151 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5152 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5153 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5154 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5155 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5156 } else {
5157 __ mov(ch2, 0xE); // all bits in byte set except last one
5158 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5159 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5160 __ lslv(tmp2, tmp2, tmp4);
5161 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5162 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5163 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5164 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5165 }
5166 __ cmp(ch1, ch2);
5167 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5168 __ b(DONE);
5169 __ align(OptoLoopAlignment);
5170 __ BIND(L_HAS_ZERO);
5171 __ rbit(tmp2, tmp2);
5172 __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
5173 // Now compress the two counters (cnt2 and cnt1) into one register. This is
5174 // fine because both counters are 32-bit and are not changed in this loop;
5175 // they are just restored on exit, so cnt1 can be re-used in the meantime.
5176 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5177 __ sub(result, result, 1);
5178 __ BIND(L_HAS_ZERO_LOOP);
5179 __ mov(cnt1, wordSize/str2_chr_size);
5180 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5181 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5182 if (str2_isL) {
5183 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5184 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5185 __ lslv(tmp2, tmp2, tmp4);
5186 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5187 __ add(tmp4, tmp4, 1);
5188 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5189 __ lsl(tmp2, tmp2, 1);
5190 __ mov(tmp4, wordSize/str2_chr_size);
5191 } else {
5192 __ mov(ch2, 0xE);
5193 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5194 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5195 __ lslv(tmp2, tmp2, tmp4);
5196 __ add(tmp4, tmp4, 1);
5197 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5198 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5199 __ lsl(tmp2, tmp2, 1);
5200 __ mov(tmp4, wordSize/str2_chr_size);
5201 __ sub(str2, str2, str2_chr_size);
5202 }
5203 __ cmp(ch1, ch2);
5204 __ mov(tmp4, wordSize/str2_chr_size);
5205 __ br(__ NE, L_CMP_LOOP_NOMATCH);
5206 __ BIND(L_CMP_LOOP);
5207 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5208 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5209 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5210 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5211 __ add(tmp4, tmp4, 1);
5212 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5213 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5214 __ cmp(cnt1, ch2);
5215 __ br(__ EQ, L_CMP_LOOP);
5216 __ BIND(L_CMP_LOOP_NOMATCH);
5217 // we did not match here
5218 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5219 __ clz(tmp4, tmp2);
5220 __ add(str2, str2, str2_chr_size); // advance pointer
5221 __ b(L_HAS_ZERO_LOOP);
5222 __ align(OptoLoopAlignment);
5223 __ BIND(L_CMP_LOOP_LAST_CMP);
5224 __ cmp(cnt1, ch2);
5225 __ br(__ NE, L_CMP_LOOP_NOMATCH);
5226 __ b(DONE);
5227 __ align(OptoLoopAlignment);
5228 __ BIND(L_CMP_LOOP_LAST_CMP2);
5229 if (str2_isL) {
5230 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5231 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5232 __ lslv(tmp2, tmp2, tmp4);
5233 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5234 __ add(tmp4, tmp4, 1);
5235 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5236 __ lsl(tmp2, tmp2, 1);
5237 } else {
5238 __ mov(ch2, 0xE);
5239 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5240 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5241 __ lslv(tmp2, tmp2, tmp4);
5242 __ add(tmp4, tmp4, 1);
5243 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5244 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5245 __ lsl(tmp2, tmp2, 1);
5246 __ sub(str2, str2, str2_chr_size);
5247 }
5248 __ cmp(ch1, ch2);
5249 __ br(__ NE, L_CMP_LOOP_NOMATCH);
5250 __ b(DONE);
5251 __ align(OptoLoopAlignment);
5252 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5253 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
5254 //    until the L_HAS_ZERO block; the byte octet was analyzed in
5255 //    L_HAS_ZERO_LOOP, so result grew by at most wordSize/str2_chr_size - 1
5256 //    and the respective high bits are unchanged. L_LOOP_PROCEED will
5257 //    increase result by the number of analyzed characters, so we can simply
5258 //    reset the lower bits of result here: clear the 2 lower bits for UU/UL
5259 //    and 3 bits for LL.
5260 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5261 // 3) Advance str2 to the next str2 octet: result & 7 (or & 3) is the index of
5262 //    the last analyzed substring inside the current octet, so str2 is at the respective start address and must be advanced to the next octet.
5263 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5264 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5265 __ bfm(result, zr, 0, 2 - str2_chr_shift);
5266 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5267 __ movw(cnt2, cnt2);
5268 __ b(L_LOOP_PROCEED);
5269 __ align(OptoLoopAlignment);
5270 __ BIND(NOMATCH);
5271 __ mov(result, -1);
5272 __ BIND(DONE);
5273 __ pop(spilled_regs, sp);
5274 __ ret(lr);
5275 return entry;
5276 }
5277
5278   void generate_string_indexof_stubs() {
5279 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5280 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5281 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5282 }
5283
5284   void inflate_and_store_2_fp_registers(bool generatePrfm,
5285 FloatRegister src1, FloatRegister src2) {
5286 Register dst = r1;
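// The caller keeps v0 zeroed; interleaving each Latin1 byte with a zero
// byte (zip1 for the low half of the vector, zip2 for the high half)
// widens the bytes to little-endian UTF-16 code units.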
5287 __ zip1(v1, __ T16B, src1, v0);
5288 __ zip2(v2, __ T16B, src1, v0);
5289 if (generatePrfm) {
5290 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5291 }
5292 __ zip1(v3, __ T16B, src2, v0);
5293 __ zip2(v4, __ T16B, src2, v0);
5294 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5295 }
5296
5297 // R0 = src
5298 // R1 = dst
5299 // R2 = len
5300 // R3 = len >> 3
5301 // V0 = 0
5302 // V1 = loaded 8 bytes
5303   address generate_large_byte_array_inflate() {
5304 __ align(CodeEntryAlignment);
5305 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5306 address entry = __ pc();
5307 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5308 Register src = r0, dst = r1, len = r2, octetCounter = r3;
5309 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5310
5311 // do one more 8-byte read so that the address is 16-byte aligned in most
5312 // cases; this also lets us use a single store instruction
5313 __ ldrd(v2, __ post(src, 8));
5314 __ sub(octetCounter, octetCounter, 2);
5315 __ zip1(v1, __ T16B, v1, v0);
5316 __ zip1(v2, __ T16B, v2, v0);
5317 __ st1(v1, v2, __ T16B, __ post(dst, 32));
5318 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5319 __ subs(rscratch1, octetCounter, large_loop_threshold);
5320 __ br(__ LE, LOOP_START);
5321 __ b(LOOP_PRFM_START);
5322 __ bind(LOOP_PRFM);
5323 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5324 __ bind(LOOP_PRFM_START);
5325 __ prfm(Address(src, SoftwarePrefetchHintDistance));
5326 __ sub(octetCounter, octetCounter, 8);
5327 __ subs(rscratch1, octetCounter, large_loop_threshold);
5328 inflate_and_store_2_fp_registers(true, v3, v4);
5329 inflate_and_store_2_fp_registers(true, v5, v6);
5330 __ br(__ GT, LOOP_PRFM);
5331 __ cmp(octetCounter, (u1)8);
5332 __ br(__ LT, DONE);
5333 __ bind(LOOP);
5334 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5335 __ bind(LOOP_START);
5336 __ sub(octetCounter, octetCounter, 8);
5337 __ cmp(octetCounter, (u1)8);
5338 inflate_and_store_2_fp_registers(false, v3, v4);
5339 inflate_and_store_2_fp_registers(false, v5, v6);
5340 __ br(__ GE, LOOP);
5341 __ bind(DONE);
5342 __ ret(lr);
5343 return entry;
5344 }
5345
5346 /**
5347 * Arguments:
5348 *
5349 * Input:
5350 * c_rarg0 - current state address
5351 * c_rarg1 - H key address
5352 * c_rarg2 - data address
5353 * c_rarg3 - number of blocks
5354 *
5355 * Output:
5356 * Updated state at c_rarg0
5357 */
5358   address generate_ghash_processBlocks() {
5359 // Bafflingly, GCM uses little-endian for the byte order, but
5360 // big-endian for the bit order. For example, the polynomial 1 is
5361 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5362 //
5363 // So, we must either reverse the bytes in each word and do
5364 // everything big-endian or reverse the bits in each byte and do
5365 // it little-endian. On AArch64 it's more idiomatic to reverse
5366 // the bits in each byte (we have an instruction, RBIT, to do
5367 // that) and keep the data in little-endian bit order throughout the
5368 // calculation, bit-reversing the inputs and outputs.
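// As a reminder of what is being computed: for each 16-byte input block
// X_i, GHASH updates the state in GF(2^128) as
//
//   state <- (state ^ X_i) * H
//
// where H is the hash subkey and the product is a carry-less multiply
// reduced by the field polynomial below.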
5369
5370 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5371 __ align(wordSize * 2);
5372 address p = __ pc();
5373 __ emit_int64(0x87); // The low-order bits of the field
5374 // polynomial (i.e. p = z^7+z^2+z+1)
5375 // repeated in the low and high parts of a
5376 // 128-bit vector
5377 __ emit_int64(0x87);
5378
5379 __ align(CodeEntryAlignment);
5380 address start = __ pc();
5381
5382 Register state = c_rarg0;
5383 Register subkeyH = c_rarg1;
5384 Register data = c_rarg2;
5385 Register blocks = c_rarg3;
5386
5387 FloatRegister vzr = v30;
5388 __ eor(vzr, __ T16B, vzr, vzr); // zero register
5389
5390 __ ldrq(v0, Address(state));
5391 __ ldrq(v1, Address(subkeyH));
5392
5393 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
5394 __ rbit(v0, __ T16B, v0);
5395 __ rev64(v1, __ T16B, v1);
5396 __ rbit(v1, __ T16B, v1);
5397
5398 __ ldrq(v26, p);
5399
5400 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5401 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
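// Precomputing (A1 ^ A0) lets ghash_multiply use Karatsuba: the 128-bit
// carry-less product takes three 64x64 multiplies (A1*B1, A0*B0 and
// (A1^A0)*(B1^B0)) instead of four, since addition in GF(2) is just XOR.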
5402
5403 {
5404 Label L_ghash_loop;
5405 __ bind(L_ghash_loop);
5406
5407 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5408 // reversing each byte
5409 __ rbit(v2, __ T16B, v2);
5410 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
5411
5412 // Multiply state in v2 by subkey in v1
5413 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5414 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
5415 /*temps*/v6, v20, v18, v21);
5416 // Reduce v7:v5 by the field polynomial
5417 ghash_reduce(v0, v5, v7, v26, vzr, v20);
5418
5419 __ sub(blocks, blocks, 1);
5420 __ cbnz(blocks, L_ghash_loop);
5421 }
5422
5423 // The bit-reversed result is at this point in v0
5424 __ rev64(v1, __ T16B, v0);
5425 __ rbit(v1, __ T16B, v1);
5426
5427 __ st1(v1, __ T16B, state);
5428 __ ret(lr);
5429
5430 return start;
5431 }
5432
5433   void generate_base64_encode_simdround(Register src, Register dst,
5434 FloatRegister codec, u8 size) {
5435
5436 FloatRegister in0 = v4, in1 = v5, in2 = v6;
5437 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5438 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5439
5440 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5441
5442 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5443
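// ld3 has deinterleaved each 3-byte group into lanes b0/b1/b2 of in0/in1/in2.
// The four 6-bit codec indices for one group are, in C notation:
//   ind0 = b0 >> 2
//   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
//   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
//   ind3 = b2 & 0x3f
// The shl/ushr/orr sequences below compute these within each 8-bit lane;
// the shifts discard the bits that belong to the neighbouring index.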
5444 __ ushr(ind0, arrangement, in0, 2);
5445
5446 __ ushr(ind1, arrangement, in1, 2);
5447 __ shl(in0, arrangement, in0, 6);
5448 __ orr(ind1, arrangement, ind1, in0);
5449 __ ushr(ind1, arrangement, ind1, 2);
5450
5451 __ ushr(ind2, arrangement, in2, 4);
5452 __ shl(in1, arrangement, in1, 4);
5453 __ orr(ind2, arrangement, in1, ind2);
5454 __ ushr(ind2, arrangement, ind2, 2);
5455
5456 __ shl(ind3, arrangement, in2, 2);
5457 __ ushr(ind3, arrangement, ind3, 2);
5458
5459 __ tbl(out0, arrangement, codec, 4, ind0);
5460 __ tbl(out1, arrangement, codec, 4, ind1);
5461 __ tbl(out2, arrangement, codec, 4, ind2);
5462 __ tbl(out3, arrangement, codec, 4, ind3);
5463
5464 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
5465 }
5466
5467 /**
5468 * Arguments:
5469 *
5470 * Input:
5471 * c_rarg0 - src_start
5472 * c_rarg1 - src_offset
5473 * c_rarg2 - src_length
5474 * c_rarg3 - dest_start
5475 * c_rarg4 - dest_offset
5476 * c_rarg5 - isURL
5477 *
5478 */
5479   address generate_base64_encodeBlock() {
5480
5481 static const char toBase64[64] = {
5482 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5483 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5484 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5485 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5486 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5487 };
5488
5489 static const char toBase64URL[64] = {
5490 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5491 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5492 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5493 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5494 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5495 };
5496
5497 __ align(CodeEntryAlignment);
5498 StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5499 address start = __ pc();
5500
5501 Register src = c_rarg0; // source array
5502 Register soff = c_rarg1; // source start offset
5503 Register send = c_rarg2; // source end offset
5504 Register dst = c_rarg3; // dest array
5505 Register doff = c_rarg4; // position for writing to dest array
5506 Register isURL = c_rarg5; // Base64 or URL character set
5507
5508 // c_rarg6 and c_rarg7 are free to use as temps
5509 Register codec = c_rarg6;
5510 Register length = c_rarg7;
5511
5512 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5513
5514 __ add(src, src, soff);
5515 __ add(dst, dst, doff);
5516 __ sub(length, send, soff);
5517
5518 // load the codec base address
5519 __ lea(codec, ExternalAddress((address) toBase64));
5520 __ cbz(isURL, ProcessData);
5521 __ lea(codec, ExternalAddress((address) toBase64URL));
5522
5523 __ BIND(ProcessData);
5524
5525 // too short to form a SIMD loop; fall back to the byte-at-a-time loop
5526 __ cmp(length, (u1)24);
5527 __ br(Assembler::LT, Process3B);
5528
5529 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5530
5531 __ BIND(Process48B);
5532 __ cmp(length, (u1)48);
5533 __ br(Assembler::LT, Process24B);
5534 generate_base64_encode_simdround(src, dst, v0, 16);
5535 __ sub(length, length, 48);
5536 __ b(Process48B);
5537
5538 __ BIND(Process24B);
5539 __ cmp(length, (u1)24);
5540 __ br(Assembler::LT, SIMDExit);
5541 generate_base64_encode_simdround(src, dst, v0, 8);
5542 __ sub(length, length, 24);
5543
5544 __ BIND(SIMDExit);
5545 __ cbz(length, Exit);
5546
5547 __ BIND(Process3B);
5548 // 3 src bytes, 24 bits
5549 __ ldrb(r10, __ post(src, 1));
5550 __ ldrb(r11, __ post(src, 1));
5551 __ ldrb(r12, __ post(src, 1));
5552 __ orrw(r11, r11, r10, Assembler::LSL, 8);
5553 __ orrw(r12, r12, r11, Assembler::LSL, 8);
5554 // codec index
5555 __ ubfmw(r15, r12, 18, 23);
5556 __ ubfmw(r14, r12, 12, 17);
5557 __ ubfmw(r13, r12, 6, 11);
5558 __ andw(r12, r12, 63);
5559 // get the code based on the codec
5560 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5561 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5562 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5563 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5564 __ strb(r15, __ post(dst, 1));
5565 __ strb(r14, __ post(dst, 1));
5566 __ strb(r13, __ post(dst, 1));
5567 __ strb(r12, __ post(dst, 1));
5568 __ sub(length, length, 3);
5569 __ cbnz(length, Process3B);
5570
5571 __ BIND(Exit);
5572 __ ret(lr);
5573
5574 return start;
5575 }
5576
5577   void generate_base64_decode_simdround(Register src, Register dst,
5578 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5579
5580 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
5581 FloatRegister out0 = v20, out1 = v21, out2 = v22;
5582
5583 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5584 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5585
5586 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5587
5588 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5589
5590 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5591
5592 // we need an unsigned saturating subtract to make sure all input values
5593 // in the range [0, 63] yield 0 in the higher-half lookup
5594 __ uqsubv(decH0, __ T16B, in0, v27);
5595 __ uqsubv(decH1, __ T16B, in1, v27);
5596 __ uqsubv(decH2, __ T16B, in2, v27);
5597 __ uqsubv(decH3, __ T16B, in3, v27);
5598
5599 // lower half lookup
5600 __ tbl(decL0, arrangement, codecL, 4, in0);
5601 __ tbl(decL1, arrangement, codecL, 4, in1);
5602 __ tbl(decL2, arrangement, codecL, 4, in2);
5603 __ tbl(decL3, arrangement, codecL, 4, in3);
5604
5605 // higher half lookup
5606 __ tbx(decH0, arrangement, codecH, 4, decH0);
5607 __ tbx(decH1, arrangement, codecH, 4, decH1);
5608 __ tbx(decH2, arrangement, codecH, 4, decH2);
5609 __ tbx(decH3, arrangement, codecH, 4, decH3);
5610
5611 // combine lower and higher
5612 __ orr(decL0, arrangement, decL0, decH0);
5613 __ orr(decL1, arrangement, decL1, decH1);
5614 __ orr(decL2, arrangement, decL2, decH2);
5615 __ orr(decL3, arrangement, decL3, decH3);
5616
5617 // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
5618 __ cmhi(decH0, arrangement, decL0, v27);
5619 __ cmhi(decH1, arrangement, decL1, v27);
5620 __ cmhi(decH2, arrangement, decL2, v27);
5621 __ cmhi(decH3, arrangement, decL3, v27);
5622 __ orr(in0, arrangement, decH0, decH1);
5623 __ orr(in1, arrangement, decH2, decH3);
5624 __ orr(in2, arrangement, in0, in1);
5625 __ umaxv(in3, arrangement, in2);
5626 __ umov(rscratch2, in3, __ B, 0);
5627
5628 // get the data to output
5629 __ shl(out0, arrangement, decL0, 2);
5630 __ ushr(out1, arrangement, decL1, 4);
5631 __ orr(out0, arrangement, out0, out1);
5632 __ shl(out1, arrangement, decL1, 4);
5633 __ ushr(out2, arrangement, decL2, 2);
5634 __ orr(out1, arrangement, out1, out2);
5635 __ shl(out2, arrangement, decL2, 6);
5636 __ orr(out2, arrangement, out2, decL3);
5637
5638 __ cbz(rscratch2, NoIllegalData);
5639
5640 // handle illegal input
5641 __ umov(r10, in2, __ D, 0);
5642 if (size == 16) {
5643 __ cbnz(r10, ErrorInLowerHalf);
5644
5645 // illegal input is in higher half, store the lower half now.
5646 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5647
5648 __ umov(r10, in2, __ D, 1);
5649 __ umov(r11, out0, __ D, 1);
5650 __ umov(r12, out1, __ D, 1);
5651 __ umov(r13, out2, __ D, 1);
5652 __ b(StoreLegalData);
5653
5654 __ BIND(ErrorInLowerHalf);
5655 }
5656 __ umov(r11, out0, __ D, 0);
5657 __ umov(r12, out1, __ D, 0);
5658 __ umov(r13, out2, __ D, 0);
5659
5660 __ BIND(StoreLegalData);
5661 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5662 __ strb(r11, __ post(dst, 1));
5663 __ strb(r12, __ post(dst, 1));
5664 __ strb(r13, __ post(dst, 1));
5665 __ lsr(r10, r10, 8);
5666 __ lsr(r11, r11, 8);
5667 __ lsr(r12, r12, 8);
5668 __ lsr(r13, r13, 8);
5669 __ b(StoreLegalData);
5670
5671 __ BIND(NoIllegalData);
5672 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5673 }
5674
5675
5676 /**
5677 * Arguments:
5678 *
5679 * Input:
5680 * c_rarg0 - src_start
5681 * c_rarg1 - src_offset
5682 * c_rarg2 - src_length
5683 * c_rarg3 - dest_start
5684 * c_rarg4 - dest_offset
5685 * c_rarg5 - isURL
5686 *
5687 */
5688   address generate_base64_decodeBlock() {
5689
5690 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
5691 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
5692 // section titled "Base64 decoding".
5693
5694 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
5695 // java.util.Base64, except that the trailing character '=' is also treated as an
5696 // illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
5697 static const uint8_t fromBase64ForNoSIMD[256] = {
5698 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5699 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5700 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
5701 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5702 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
5703 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
5704 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
5705 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
5706 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5707 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5708 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5709 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5710 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5711 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5712 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5713 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5714 };
5715
5716 static const uint8_t fromBase64URLForNoSIMD[256] = {
5717 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5718 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5719 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
5720 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5721 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
5722 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
5723 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
5724 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
5725 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5726 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5727 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5728 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5729 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5730 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5731 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5732 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5733 };
5734
5735 // A legal base64 code value is in the range [0, 127]. We need two table
5736 // lookups with tbl/tbx and combine the results to get the decoded data. The
5737 // 1st table vector lookup uses tbl; out-of-range indices are set to 0 in the
5738 // destination. The 2nd table vector lookup uses tbx; out-of-range indices
5739 // leave the destination unchanged. Input [64..126] is mapped to index
5740 // [65, 127] in the second lookup. The value at index 64 is set to 0, so that
5741 // we know the decoded data already came from the 1st lookup.
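// For example, for the input byte 'a' (0x61 = 97): the first lookup is out of
// range (97 > 63) and yields 0; the second lookup reads index 98 of the
// 128-byte table, fromBase64ForSIMD[98] = 26, which is indeed the decoded
// value of 'a'.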
5742 static const uint8_t fromBase64ForSIMD[128] = {
5743 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5744 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5745 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
5746 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5747 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
5748 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
5749 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
5750 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
5751 };
5752
5753 static const uint8_t fromBase64URLForSIMD[128] = {
5754 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5755 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5756 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
5757 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5758 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
5759 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
5760 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
5761 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
5762 };
5763
5764 __ align(CodeEntryAlignment);
5765 StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5766 address start = __ pc();
5767
5768 Register src = c_rarg0; // source array
5769 Register soff = c_rarg1; // source start offset
5770 Register send = c_rarg2; // source end offset
5771 Register dst = c_rarg3; // dest array
5772 Register doff = c_rarg4; // position for writing to dest array
5773 Register isURL = c_rarg5; // Base64 or URL character set
5774
5775 Register length = send; // reuse send as length of source data to process
5776
5777 Register simd_codec = c_rarg6;
5778 Register nosimd_codec = c_rarg7;
5779
5780 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
5781
5782 __ enter();
5783
5784 __ add(src, src, soff);
5785 __ add(dst, dst, doff);
5786
5787 __ mov(doff, dst);
5788
5789 __ sub(length, send, soff);
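// clear the two low bits: the decoder consumes source bytes in 4-byte quanta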
5790 __ bfm(length, zr, 0, 1);
5791
5792 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
5793 __ cbz(isURL, ProcessData);
5794 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
5795
5796 __ BIND(ProcessData);
5797 __ mov(rscratch1, length);
5798 __ cmp(length, (u1)144); // 144 = 80 + 64
5799 __ br(Assembler::LT, Process4B);
5800
5801 // In the MIME case, the line length cannot be more than 76
5802 // bytes (see RFC 2045). This is too short a block for SIMD
5803 // to be worthwhile, so we use non-SIMD here.
5804 __ movw(rscratch1, 79);
5805
5806 __ BIND(Process4B);
5807 __ ldrw(r14, __ post(src, 4));
5808 __ ubfxw(r10, r14, 0, 8);
5809 __ ubfxw(r11, r14, 8, 8);
5810 __ ubfxw(r12, r14, 16, 8);
5811 __ ubfxw(r13, r14, 24, 8);
5812 // get the decoded values
5813 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
5814 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
5815 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
5816 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
5817 // error detection, 255u indicates an illegal input
5818 __ orrw(r14, r10, r11);
5819 __ orrw(r15, r12, r13);
5820 __ orrw(r14, r14, r15);
5821 __ tbnz(r14, 7, Exit);
5822 // recover the data
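// With d0..d3 the four 6-bit values in r10..r13, r14 is assembled as
// (d0 << 10) | (d1 << 4) | (d2 >> 2), i.e. output bytes 0 and 1 in
// big-endian order; rev16w swaps them for the little-endian strh. The
// remaining byte, ((d2 & 0x3) << 6) | d3, is built in r13 by bfiw.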
5823 __ lslw(r14, r10, 10);
5824 __ bfiw(r14, r11, 4, 6);
5825 __ bfmw(r14, r12, 2, 5);
5826 __ rev16w(r14, r14);
5827 __ bfiw(r13, r12, 6, 2);
5828 __ strh(r14, __ post(dst, 2));
5829 __ strb(r13, __ post(dst, 1));
5830 // non-simd loop
5831 __ subsw(rscratch1, rscratch1, 4);
5832 __ br(Assembler::GT, Process4B);
5833
5834 // if we came through the MIME path (rscratch1 started at 79), rscratch1 == -1;
5835 // otherwise, rscratch1 == 0.
5836 __ cbzw(rscratch1, Exit);
5837 __ sub(length, length, 80);
5838
5839 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
5840 __ cbz(isURL, SIMDEnter);
5841 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
5842
5843 __ BIND(SIMDEnter);
5844 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
5845 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
5846 __ mov(rscratch1, 63);
5847 __ dup(v27, __ T16B, rscratch1);
5848
5849 __ BIND(Process64B);
5850 __ cmp(length, (u1)64);
5851 __ br(Assembler::LT, Process32B);
5852 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
5853 __ sub(length, length, 64);
5854 __ b(Process64B);
5855
5856 __ BIND(Process32B);
5857 __ cmp(length, (u1)32);
5858 __ br(Assembler::LT, SIMDExit);
5859 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
5860 __ sub(length, length, 32);
5861 __ b(Process32B);
5862
5863 __ BIND(SIMDExit);
5864 __ cbz(length, Exit);
5865 __ movw(rscratch1, length);
5866 __ b(Process4B);
5867
5868 __ BIND(Exit);
5869 __ sub(c_rarg0, dst, doff);
5870
5871 __ leave();
5872 __ ret(lr);
5873
5874 return start;
5875 }
5876
5877 #if defined(LINUX) || defined(_ALLBSD_SOURCE)
5878
5879 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
5880 //
5881 // If LSE is in use, generate LSE versions of all the stubs. The
5882 // non-LSE versions are in atomic_aarch64.S.
5883
5884 // class AtomicStubMark records the entry point of a stub and the
5885 // stub pointer which will point to it. The stub pointer is set to
5886 // the entry point when ~AtomicStubMark() is called, which must be
5887 // after ICache::invalidate_range. This ensures safe publication of
5888 // the generated code.
5889 class AtomicStubMark {
5890 address _entry_point;
5891 aarch64_atomic_stub_t *_stub;
5892 MacroAssembler *_masm;
5893 public:
5894     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
5895 _masm = masm;
5896 __ align(32);
5897 _entry_point = __ pc();
5898 _stub = stub;
5899 }
5900     ~AtomicStubMark() {
5901 *_stub = (aarch64_atomic_stub_t)_entry_point;
5902 }
5903 };
5904
5905 // NB: For memory_order_conservative we need a trailing membar after
5906 // LSE atomic operations but not a leading membar.
5907 //
5908 // We don't need a leading membar because a clause in the Arm ARM
5909 // says:
5910 //
5911 // Barrier-ordered-before
5912 //
5913 // Barrier instructions order prior Memory effects before subsequent
5914 // Memory effects generated by the same Observer. A read or a write
5915 // RW1 is Barrier-ordered-before a read or a write RW2 from the same
5916 // Observer if and only if RW1 appears in program order before RW2
5917 // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
5918 // instruction with both Acquire and Release semantics.
5919 //
5920 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
5921 // and Release semantics, therefore we don't need a leading
5922 // barrier. However, there is no corresponding Barrier-ordered-after
5923 // relationship, therefore we need a trailing membar to prevent a
5924 // later store or load from being reordered with the store in an
5925 // atomic instruction.
5926 //
5927 // This was checked by using the herd7 consistency model simulator
5928 // (http://diy.inria.fr/) with this test case:
5929 //
5930 // AArch64 LseCas
5931 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
5932 // P0 | P1;
5933 // LDR W4, [X2] | MOV W3, #0;
5934 // DMB LD | MOV W4, #1;
5935 // LDR W3, [X1] | CASAL W3, W4, [X1];
5936 // | DMB ISH;
5937 // | STR W4, [X2];
5938 // exists
5939 // (0:X3=0 /\ 0:X4=1)
5940 //
5941 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
5942 // with the store to x in P1. Without the DMB in P1 this may happen.
5943 //
5944 // At the time of writing we don't know of any AArch64 hardware that
5945 // reorders stores in this way, but the Reference Manual permits it.
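// In the stubs below, that trailing barrier is the
// membar(StoreStore|StoreLoad) issued after each memory_order_conservative
// LSE operation.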
5946
5947   void gen_cas_entry(Assembler::operand_size size,
5948 atomic_memory_order order) {
5949 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
5950 exchange_val = c_rarg2;
5951 bool acquire, release;
5952 switch (order) {
5953 case memory_order_relaxed:
5954 acquire = false;
5955 release = false;
5956 break;
5957 default:
5958 acquire = true;
5959 release = true;
5960 break;
5961 }
5962 __ mov(prev, compare_val);
5963 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
5964 if (order == memory_order_conservative) {
5965 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5966 }
5967 if (size == Assembler::xword) {
5968 __ mov(r0, prev);
5969 } else {
5970 __ movw(r0, prev);
5971 }
5972 __ ret(lr);
5973 }
5974
5975   void gen_ldaddal_entry(Assembler::operand_size size) {
5976 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
5977 __ ldaddal(size, incr, prev, addr);
5978 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5979 if (size == Assembler::xword) {
5980 __ mov(r0, prev);
5981 } else {
5982 __ movw(r0, prev);
5983 }
5984 __ ret(lr);
5985 }
5986
5987   void gen_swpal_entry(Assembler::operand_size size) {
5988 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
5989 __ swpal(size, incr, prev, addr);
5990 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
5991 if (size == Assembler::xword) {
5992 __ mov(r0, prev);
5993 } else {
5994 __ movw(r0, prev);
5995 }
5996 __ ret(lr);
5997 }
5998
5999   void generate_atomic_entry_points() {
6000 if (! UseLSE) {
6001 return;
6002 }
6003
6004 __ align(CodeEntryAlignment);
6005 StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6006 address first_entry = __ pc();
6007
6008 // All memory_order_conservative
6009 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6010 gen_ldaddal_entry(Assembler::word);
6011 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6012 gen_ldaddal_entry(Assembler::xword);
6013
6014 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6015 gen_swpal_entry(Assembler::word);
6016 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6017 gen_swpal_entry(Assembler::xword);
6018
6019 // CAS, memory_order_conservative
6020 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6021 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6022 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6023 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6024 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6025 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6026
6027 // CAS, memory_order_relaxed
6028 AtomicStubMark mark_cmpxchg_1_relaxed
6029 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6030 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6031 AtomicStubMark mark_cmpxchg_4_relaxed
6032 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6033 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6034 AtomicStubMark mark_cmpxchg_8_relaxed
6035 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6036 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6037
6038 ICache::invalidate_range(first_entry, __ pc() - first_entry);
6039 }
6040 #endif // LINUX || _ALLBSD_SOURCE
6041
6042 // Continuation point for throwing of implicit exceptions that are
6043 // not handled in the current activation. Fabricates an exception
6044 // oop and initiates normal exception dispatching in this
6045 // frame. Since we need to preserve callee-saved values (currently
6046 // only for C2, but done for C1 as well) we need a callee-saved oop
6047 // map and therefore have to make these stubs into RuntimeStubs
6048 // rather than BufferBlobs. If the compiler needs all registers to
6049 // be preserved between the fault point and the exception handler
6050 // then it must assume responsibility for that in
6051 // AbstractCompiler::continuation_for_implicit_null_exception or
6052 // continuation_for_implicit_division_by_zero_exception. All other
6053 // implicit exceptions (e.g., NullPointerException or
6054 // AbstractMethodError on entry) are either at call sites or
6055 // otherwise assume that stack unwinding will be initiated, so
6056 // caller saved registers were assumed volatile in the compiler.
6057
6058 #undef __
6059 #define __ masm->
6060
6061   address generate_throw_exception(const char* name,
6062 address runtime_entry,
6063 Register arg1 = noreg,
6064 Register arg2 = noreg) {
6065 // Information about frame layout at time of blocking runtime call.
6066 // Note that we only have to preserve callee-saved registers since
6067 // the compilers are responsible for supplying a continuation point
6068 // if they expect all registers to be preserved.
6069 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6070 enum layout {
6071 rfp_off = 0,
6072 rfp_off2,
6073 return_off,
6074 return_off2,
6075 framesize // inclusive of return address
6076 };
6077
6078 int insts_size = 512;
6079 int locs_size = 64;
6080
6081 CodeBuffer code(name, insts_size, locs_size);
6082 OopMapSet* oop_maps = new OopMapSet();
6083 MacroAssembler* masm = new MacroAssembler(&code);
6084
6085 address start = __ pc();
6086
6087 // This is an inlined and slightly modified version of call_VM
6088 // which has the ability to fetch the return PC out of
6089 // thread-local storage and also sets up last_Java_sp slightly
6090 // differently than the real call_VM
6091
6092 __ enter(); // Save FP and LR before call
6093
6094 assert(is_even(framesize/2), "sp not 16-byte aligned");
6095
6096 // lr and fp are already in place
6097 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6098
6099 int frame_complete = __ pc() - start;
6100
6101 // Set up last_Java_sp and last_Java_fp
6102 address the_pc = __ pc();
6103 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6104
6105 // Call runtime
6106 if (arg1 != noreg) {
6107 assert(arg2 != c_rarg1, "clobbered");
6108 __ mov(c_rarg1, arg1);
6109 }
6110 if (arg2 != noreg) {
6111 __ mov(c_rarg2, arg2);
6112 }
6113 __ mov(c_rarg0, rthread);
6114 BLOCK_COMMENT("call runtime_entry");
6115 __ mov(rscratch1, runtime_entry);
6116 __ blr(rscratch1);
6117
6118 // Generate oop map
6119 OopMap* map = new OopMap(framesize, 0);
6120
6121 oop_maps->add_gc_map(the_pc - start, map);
6122
6123 __ reset_last_Java_frame(true);
6124
6125 // Reinitialize the ptrue predicate register, in case the external runtime
6126 // call clobbers ptrue reg, as we may return to SVE compiled code.
6127 __ reinitialize_ptrue();
6128
6129 __ leave();
6130
6131 // check for pending exceptions
6132 #ifdef ASSERT
6133 Label L;
6134 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6135 __ cbnz(rscratch1, L);
6136 __ should_not_reach_here();
6137 __ bind(L);
6138 #endif // ASSERT
6139 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6140
6141
6142 // codeBlob framesize is in words (not VMRegImpl::slot_size)
6143 RuntimeStub* stub =
6144 RuntimeStub::new_runtime_stub(name,
6145 &code,
6146 frame_complete,
6147 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6148 oop_maps, false);
6149 return stub->entry_point();
6150 }
6151
6152 class MontgomeryMultiplyGenerator : public MacroAssembler {
6153
6154 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6155 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6156
6157 RegSet _toSave;
6158 bool _squaring;
6159
6160 public:
6161     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6162 : MacroAssembler(as->code()), _squaring(squaring) {
6163
6164 // Register allocation
6165
6166 RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6167 Pa_base = *regs; // Argument registers
6168 if (squaring)
6169 Pb_base = Pa_base;
6170 else
6171 Pb_base = *++regs;
6172 Pn_base = *++regs;
6173 Rlen = *++regs;
6174 inv = *++regs;
6175 Pm_base = *++regs;
6176
6177 // Working registers:
6178 Ra = *++regs; // The current digit of a, b, n, and m.
6179 Rb = *++regs;
6180 Rm = *++regs;
6181 Rn = *++regs;
6182
6183 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
6184 Pb = *++regs;
6185 Pm = *++regs;
6186 Pn = *++regs;
6187
6188 t0 = *++regs; // Three registers which form a
6189 t1 = *++regs; // triple-precision accumulator.
6190 t2 = *++regs;
6191
6192 Ri = *++regs; // Inner and outer loop indexes.
6193 Rj = *++regs;
6194
6195 Rhi_ab = *++regs; // Product registers: low and high parts
6196 Rlo_ab = *++regs; // of a*b and m*n.
6197 Rhi_mn = *++regs;
6198 Rlo_mn = *++regs;
6199
6200 // r19 and up are callee-saved.
6201 _toSave = RegSet::range(r19, *regs) + Pm_base;
6202 }
6203
6204 private:
6205     void save_regs() {
6206 push(_toSave, sp);
6207 }
6208
6209     void restore_regs() {
6210 pop(_toSave, sp);
6211 }
6212
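// Invoke `block` `count` times, two copies per loop iteration. An odd
// count branches straight to the second copy (label `odd`), so exactly
// `count` invocations execute in every case.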
6213 template <typename T>
6214     void unroll_2(Register count, T block) {
6215 Label loop, end, odd;
6216 tbnz(count, 0, odd);
6217 cbz(count, end);
6218 align(16);
6219 bind(loop);
6220 (this->*block)();
6221 bind(odd);
6222 (this->*block)();
6223 subs(count, count, 2);
6224 br(Assembler::GT, loop);
6225 bind(end);
6226 }
6227
6228 template <typename T>
6229     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6230 Label loop, end, odd;
6231 tbnz(count, 0, odd);
6232 cbz(count, end);
6233 align(16);
6234 bind(loop);
6235 (this->*block)(d, s, tmp);
6236 bind(odd);
6237 (this->*block)(d, s, tmp);
6238 subs(count, count, 2);
6239 br(Assembler::GT, loop);
6240 bind(end);
6241 }
6242
6243     void pre1(RegisterOrConstant i) {
6244 block_comment("pre1");
6245 // Pa = Pa_base;
6246 // Pb = Pb_base + i;
6247 // Pm = Pm_base;
6248 // Pn = Pn_base + i;
6249 // Ra = *Pa;
6250 // Rb = *Pb;
6251 // Rm = *Pm;
6252 // Rn = *Pn;
6253 ldr(Ra, Address(Pa_base));
6254 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6255 ldr(Rm, Address(Pm_base));
6256 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6257 lea(Pa, Address(Pa_base));
6258 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6259 lea(Pm, Address(Pm_base));
6260 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6261
6262 // Zero the m*n result.
6263 mov(Rhi_mn, zr);
6264 mov(Rlo_mn, zr);
6265 }

  // The core multiply-accumulate step of a Montgomery
  // multiplication. The idea is to schedule operations as a
  // pipeline so that instructions with long latencies (loads and
  // multiplies) have time to complete before their results are
  // used. This benefits in-order implementations of the
  // architecture the most, but out-of-order ones benefit too.
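  //
  // A MACC(A, B, t0, t1, t2) in the pseudocode here and below adds
  // the 128-bit product A*B into the triple-precision accumulator
  // t0:t1:t2. In C, approximately (a sketch assuming a compiler with
  // unsigned __int128; MACC2 in the squaring code accumulates the
  // product twice):
  //
  //   unsigned __int128 p = (unsigned __int128)A * B;
  //   unsigned __int128 lo = (((unsigned __int128)t1 << 64) | t0) + p;
  //   t0 = (julong)lo;
  //   t1 = (julong)(lo >> 64);
  //   t2 += (lo < p);  // carry out of the low 128 bits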
  void step() {
    block_comment("step");
    // MACC(Ra, Rb, t0, t1, t2);
    // Ra = *++Pa;
    // Rb = *--Pb;
    umulh(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    ldr(Ra, pre(Pa, wordSize));
    ldr(Rb, pre(Pb, -wordSize));
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                     // previous iteration.
    // MACC(Rm, Rn, t0, t1, t2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    umulh(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);
  }

  void post1() {
    block_comment("post1");

    // MACC(Ra, Rb, t0, t1, t2);
    // Ra = *++Pa;
    // Rb = *--Pb;
    umulh(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);

    // *Pm = Rm = t0 * inv;
    mul(Rm, t0, inv);
    str(Rm, Address(Pm));

    // MACC(Rm, Rn, t0, t1, t2);
    // t0 = t1; t1 = t2; t2 = 0;
    umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, t0, Rlo_mn);
      Label ok;
      cbz(Rlo_mn, ok); {
        stop("broken Montgomery multiply");
      } bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -t0. t0 + (-t0) must generate a carry iff
    // t0 != 0. So, rather than do a mul and an adds we just set
    // the carry flag iff t0 is nonzero.
    //
    // mul(Rlo_mn, Rm, Rn);
    // adds(zr, t0, Rlo_mn);
    subs(zr, t0, 1); // Set carry iff t0 is nonzero
    adcs(t0, t1, Rhi_mn);
    adc(t1, t2, zr);
    mov(t2, zr);
  }

  void pre2(RegisterOrConstant i, RegisterOrConstant len) {
    block_comment("pre2");
    // Pa = Pa_base + i-len;
    // Pb = Pb_base + len;
    // Pm = Pm_base + i-len;
    // Pn = Pn_base + len;

    if (i.is_register()) {
      sub(Rj, i.as_register(), len);
    } else {
      mov(Rj, i.as_constant());
      sub(Rj, Rj, len);
    }
    // Rj == i-len

    lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
    lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
    lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
    lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

    // Ra = *++Pa;
    // Rb = *--Pb;
    // Rm = *++Pm;
    // Rn = *--Pn;
    ldr(Ra, pre(Pa, wordSize));
    ldr(Rb, pre(Pb, -wordSize));
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));

    mov(Rhi_mn, zr);
    mov(Rlo_mn, zr);
  }

  void post2(RegisterOrConstant i, RegisterOrConstant len) {
    block_comment("post2");
    if (i.is_constant()) {
      mov(Rj, i.as_constant() - len.as_constant());
    } else {
      sub(Rj, i.as_register(), len);
    }

    adds(t0, t0, Rlo_mn); // The pending m*n, low part

    // As soon as we know the least significant digit of our result,
    // store it.
    // Pm_base[i-len] = t0;
    str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

    // t0 = t1; t1 = t2; t2 = 0;
    adcs(t0, t1, Rhi_mn); // The pending m*n, high part
    adc(t1, t2, zr);
    mov(t2, zr);
  }

  // A carry in t0 after Montgomery multiplication means that we
  // should subtract multiples of n from our result in m. We'll
  // keep doing that until there is no carry.
  void normalize(RegisterOrConstant len) {
    block_comment("normalize");
    // while (t0)
    //   t0 = sub(Pm_base, Pn_base, t0, len);
    Label loop, post, again;
    Register cnt = t1, i = t2; // Re-use registers; we're done with them now
    cbz(t0, post); {
      bind(again); {
        mov(i, zr);
        mov(cnt, len);
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        subs(zr, zr, zr); // set carry flag, i.e. no borrow
        align(16);
        bind(loop); {
          sbcs(Rm, Rm, Rn);
          str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          add(i, i, 1);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          sub(cnt, cnt, 1);
        } cbnz(cnt, loop);
        sbc(t0, t0, zr);
      } cbnz(t0, again);
    } bind(post);
  }
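
  // In C, the sub() referred to above is approximately (a sketch; it
  // subtracts n from m once across all len words and returns the
  // borrow-adjusted carry):
  //
  //   static julong sub(julong Pm_base[], julong Pn_base[],
  //                     julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong m = Pm_base[i], n = Pn_base[i];
  //       julong d = m - n - borrow;
  //       borrow = (m < n) || (m == n && borrow);
  //       Pm_base[i] = d;
  //     }
  //     return t0 - borrow;
  //   }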

  // Move memory at s to d, reversing words.
  //    Increments d to end of copied memory
  //    Destroys tmp1, tmp2
  //    Preserves len
  //    Leaves s pointing to the address which was in d at start
  void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
    assert(tmp1 < r19 && tmp2 < r19, "register corruption");

    lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
    mov(tmp1, len);
    unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
    sub(s, d, len, ext::uxtw, LogBytesPerWord);
  }
  // where
  void reverse1(Register d, Register s, Register tmp) {
    ldr(tmp, pre(s, -wordSize));
    ror(tmp, tmp, 32);
    str(tmp, post(d, wordSize));
  }
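
  // In C, ignoring the final register adjustments, approximately:
  //
  //   void reverse(julong *d, julong *s, int len) {
  //     julong *t = s + len;             // one past the end of s
  //     while (t > s) {
  //       julong x = *--t;
  //       *d++ = (x << 32) | (x >> 32);  // swap the 32-bit halves
  //     }
  //   }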

  void step_squaring() {
    // An extra ACC
    step();
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);
  }

  void last_squaring(RegisterOrConstant i) {
    Label dont;
    // if ((i & 1) == 0) {
    tbnz(i.as_register(), 0, dont); {
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    } bind(dont);
  }

  void extra_step_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // MACC(Rm, Rn, t0, t1, t2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    umulh(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));
  }

  void post1_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // *Pm = Rm = t0 * inv;
    mul(Rm, t0, inv);
    str(Rm, Address(Pm));

    // MACC(Rm, Rn, t0, t1, t2);
    // t0 = t1; t1 = t2; t2 = 0;
    umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, t0, Rlo_mn);
      Label ok;
      cbz(Rlo_mn, ok); {
        stop("broken Montgomery multiply");
      } bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -t0. t0 + (-t0) must generate a carry iff
    // t0 != 0. So, rather than do a mul and an adds we just set
    // the carry flag iff t0 is nonzero.
    //
    // mul(Rlo_mn, Rm, Rn);
    // adds(zr, t0, Rlo_mn);
    subs(zr, t0, 1); // Set carry iff t0 is nonzero
    adcs(t0, t1, Rhi_mn);
    adc(t1, t2, zr);
    mov(t2, zr);
  }

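  // Add the double-word product Rhi:Rlo into the triple-precision
  // accumulator t0:t1:t2, propagating the carries; t0 is the least
  // significant word.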
  void acc(Register Rhi, Register Rlo,
           Register t0, Register t1, Register t2) {
    adds(t0, t0, Rlo);
    adcs(t1, t1, Rhi);
    adc(t2, t2, zr);
  }

 public:
  /**
   * Fast Montgomery multiplication. The derivation of the
   * algorithm is in A Cryptographic Library for the Motorola
   * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
   *
   * Arguments:
   *
   * Inputs for multiplication:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements b
   *   c_rarg2   - int array elements n (the modulus)
   *   c_rarg3   - int length
   *   c_rarg4   - int inv
   *   c_rarg5   - int array elements m (the result)
   *
   * Inputs for squaring:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_multiply() {
    Label argh, nothing;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    cbzw(Rlen, nothing);

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);
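
    // Note: the area allocated above is Rlen * 16 bytes, i.e. room
    // for four arrays of Rlen jints (the reversed copies of a, b and
    // n, plus the result m); the check above caps this at
    // 512 * 16 == 8192.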

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      if (!_squaring)
        reverse(Ra, Pb_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

#ifndef PRODUCT
    // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
    {
      ldr(Rn, Address(Pn_base, 0));
      mul(Rlo_mn, Rn, inv);
      subs(zr, Rlo_mn, -1);
      Label ok;
      br(EQ, ok); {
        stop("broken inverse in Montgomery multiply");
      } bind(ok);
    }
#endif

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      cmpw(Ri, Rlen);
      br(Assembler::GE, end);

      bind(loop);
      pre1(Ri);

      block_comment(" for (j = i; j; j--) {"); {
        movw(Rj, Ri);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment(" } // j");

      post1();
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen);
      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      bind(loop);
      pre2(Ri, Rlen);

      block_comment(" for (j = len*2-i-1; j; j--) {"); {
        lslw(Rj, Rlen, 1);
        subw(Rj, Rj, Ri);
        subw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment(" } // j");

      post2(Ri, Rlen);
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::LT, loop);
      bind(end);
    }
    block_comment("} // i");

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    bind(nothing);
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_multiply(julong Pa_base[], julong Pb_base[],
  //                     julong Pn_base[], julong Pm_base[],
  //                     julong inv, int len) {
  //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   julong *Pa, *Pb, *Pn, *Pm;
  //   julong Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pb_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = i;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
  //     MACC(Ra, Rb, t0, t1, t2);
  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pb_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = len*2-i-1;
  //     for (j = i-len+1; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }

  /**
   * Fast Montgomery squaring. This uses asymptotically 25% fewer
   * multiplies than Montgomery multiplication so it should be up to
   * 25% faster. However, its loop control is more complex and it
   * may actually run slower on some machines.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen);
      br(Assembler::GE, end);

      pre1(Ri);

      block_comment("for (j = (i+1)/2; j; j--) {"); {
        add(Rj, Ri, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = i/2; j; j--) {"); {
        lsr(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post1_squaring();
      add(Ri, Ri, 1);
      cmp(Ri, Rlen);
      br(Assembler::LT, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      pre2(Ri, Rlen);

      block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post2(Ri, Rlen);
      add(Ri, Ri, 1);
      cmp(Ri, Rlen, Assembler::LSL, 1);

      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(julong Pa_base[], julong Pn_base[],
  //                   julong Pm_base[], julong inv, int len) {
  //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   julong *Pa, *Pb, *Pn, *Pm;
  //   julong Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
};


  // Initialization
  void generate_initial() {
    // Generate the initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),
                       &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t),
                       &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#if defined(LINUX) || defined(_ALLBSD_SOURCE)

    generate_atomic_entry_points();

#endif // LINUX || _ALLBSD_SOURCE

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}


#if defined(LINUX) || defined(_ALLBSD_SOURCE)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                         \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
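
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) above expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;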

#undef DEFAULT_ATOMIC_OP

#endif // LINUX || _ALLBSD_SOURCE
