/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
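// Scale factor for addressing an oop array element: the shift is
// log2 of the element size, i.e. 4 bytes with compressed oops
// enabled and 8 bytes otherwise.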
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
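  // For reference, the C-side caller reaches this stub through a
  // function pointer matching the argument list above; a rough sketch
  // (cf. the CallStub typedef in stubRoutines.hpp):
  //
  //   typedef void (*CallStub)(address   link,             // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //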
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
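  // n.b. only every other saved-register slot is named: the stp/stpd
  // instructions below store registers in pairs, so e.g. d14 is
  // implicitly at d15_off + 1 and r27 at r28_off + 1.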

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
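    // sp_after_call_off is negative, so this sets sp = rfp - 26 words,
    // reserving the whole save area described in the layout above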

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
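    // round sp down to a 16-byte boundary; the AArch64 ABI requires
    // sp to be 16-byte aligned whenever it is used to access memory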

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

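    // copy the incoming parameters from the array at c_rarg5 onto the
    // stack; subsw updates the flags, so the GT branch keeps looping
    // until the count in c_rarg6 reaches zero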
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing method, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
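      // low_limit is in bytes but cnt is in (8-byte) words, hence the
      // shift right by 3 in the comparison below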
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
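    // bias is the amount by which s and d are pre-adjusted on a
    // forward copy so that the same positive {2, 4, 6, 8} * unit
    // offsets used below work for both directions (unit is negative
    // when copying backwards)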

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
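    // eight words are now in flight; unless at least eight more remain
    // (i.e. count was >= 16) skip the main loop and go straight to the
    // drain code that stores the words already loaded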
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

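    // count is in units of granularity, so bit (3 - log2(granularity))
    // of count is set exactly when a whole 8-byte word remains to copy;
    // the smaller chunk sizes are tested the same way below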
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
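    // the bulk copy blocks take their word count in rscratch2; count
    // itself still holds the original unit count, and the tail copy
    // below only looks at its low bits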
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }



  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
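    // n.b. this is an unsigned comparison: if d is below s the
    // subtraction wraps to a large value, so the HS branch also covers
    // that case, where a forward copy is always safe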

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

1609 // Arguments:
1610 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1611 // ignored
1612 // name - stub name string
1613 //
1614 // Inputs:
1615 // c_rarg0 - source array address
1616 // c_rarg1 - destination array address
1617 // c_rarg2 - element count, treated as size_t, can be zero
1618 //
generate_conjoint_long_copy(bool aligned,address nooverlap_target,address * entry,const char * name,bool dest_uninitialized=false)1619 address generate_conjoint_long_copy(bool aligned,
1620 address nooverlap_target, address *entry,
1621 const char *name, bool dest_uninitialized = false) {
1622 const bool not_oop = false;
1623 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1624 }
1625
1626 // Arguments:
1627 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1628 // ignored
1629 // name - stub name string
1630 //
1631 // Inputs:
1632 // c_rarg0 - source array address
1633 // c_rarg1 - destination array address
1634 // c_rarg2 - element count, treated as size_t, can be zero
1635 //
1636 // Side Effects:
1637 // disjoint_oop_copy_entry is set to the no-overlap entry point
1638 // used by generate_conjoint_oop_copy().
1639 //
1640 address generate_disjoint_oop_copy(bool aligned, address *entry,
1641 const char *name, bool dest_uninitialized) {
1642 const bool is_oop = true;
1643 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1644 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1645 }
1646
1647 // Arguments:
1648 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649 // ignored
1650 // name - stub name string
1651 //
1652 // Inputs:
1653 // c_rarg0 - source array address
1654 // c_rarg1 - destination array address
1655 // c_rarg2 - element count, treated as size_t, can be zero
1656 //
1657 address generate_conjoint_oop_copy(bool aligned,
1658 address nooverlap_target, address *entry,
1659 const char *name, bool dest_uninitialized) {
1660 const bool is_oop = true;
1661 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1662 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1663 name, dest_uninitialized);
1664 }
1665
1666
1667 // Helper for generating a dynamic type check.
1668 // Smashes rscratch1, rscratch2.
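// Illustrative sketch of the emitted check (commented pseudo-code, not
// literal C++ -- see MacroAssembler::check_klass_subtype_fast_path/slow_path):
//   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;
//   if (super_check_offset != secondary_super_cache_offset) goto L_miss;
//   if (secondary supers of sub_klass contain super_klass)  goto L_success;
//   fall through to L_miss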
1669 void generate_type_check(Register sub_klass,
1670 Register super_check_offset,
1671 Register super_klass,
1672 Label& L_success) {
1673 assert_different_registers(sub_klass, super_check_offset, super_klass);
1674
1675 BLOCK_COMMENT("type_check:");
1676
1677 Label L_miss;
1678
1679 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1680 super_check_offset);
1681 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1682
1683 // Fall through on failure!
1684 __ BIND(L_miss);
1685 }
1686
1687 //
1688 // Generate checkcasting array copy stub
1689 //
1690 // Input:
1691 // c_rarg0 - source array address
1692 // c_rarg1 - destination array address
1693 // c_rarg2 - element count, treated as ssize_t, can be zero
1694 // c_rarg3 - size_t ckoff (super_check_offset)
1695 // c_rarg4 - oop ckval (super_klass)
1696 //
1697 // Output:
1698 // r0 == 0 - success
1699 // r0 == -1^K - failure, where K is partial transfer count
1700 //
1701 address generate_checkcast_copy(const char *name, address *entry,
1702 bool dest_uninitialized = false) {
1703
1704 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1705
1706 // Input registers (after setup_arg_regs)
1707 const Register from = c_rarg0; // source array address
1708 const Register to = c_rarg1; // destination array address
1709 const Register count = c_rarg2; // elements count
1710 const Register ckoff = c_rarg3; // super_check_offset
1711 const Register ckval = c_rarg4; // super_klass
1712
1713 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1714 RegSet wb_post_saved_regs = RegSet::of(count);
1715
1716 // Registers used as temps (r18, r19, r20 are save-on-entry)
1717 const Register count_save = r21; // orig elements count
1718 const Register start_to = r20; // destination array start address
1719 const Register copied_oop = r18; // actual oop copied
1720 const Register r19_klass = r19; // oop._klass
1721
1722 //---------------------------------------------------------------
1723 // Assembler stub will be used for this call to arraycopy
1724 // if the two arrays are subtypes of Object[] but the
1725 // destination array type is not equal to or a supertype
1726 // of the source type. Each element must be separately
1727 // checked.
1728
1729 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1730 copied_oop, r19_klass, count_save);
1731
1732 __ align(CodeEntryAlignment);
1733 StubCodeMark mark(this, "StubRoutines", name);
1734 address start = __ pc();
1735
1736 __ enter(); // required for proper stackwalking of RuntimeStub frame
1737
1738 #ifdef ASSERT
1739 // caller guarantees that the arrays really are different
1740 // otherwise, we would have to make conjoint checks
1741 { Label L;
1742 array_overlap_test(L, TIMES_OOP);
1743 __ stop("checkcast_copy within a single array");
1744 __ bind(L);
1745 }
1746 #endif //ASSERT
1747
1748 // Caller of this entry point must set up the argument registers.
1749 if (entry != NULL) {
1750 *entry = __ pc();
1751 BLOCK_COMMENT("Entry:");
1752 }
1753
1754 // Empty array: Nothing to do.
1755 __ cbz(count, L_done);
1756
1757 __ push(RegSet::of(r18, r19, r20, r21), sp);
1758
1759 #ifdef ASSERT
1760 BLOCK_COMMENT("assert consistent ckoff/ckval");
1761 // The ckoff and ckval must be mutually consistent,
1762 // even though caller generates both.
1763 { Label L;
1764 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1765 __ ldrw(start_to, Address(ckval, sco_offset));
1766 __ cmpw(ckoff, start_to);
1767 __ br(Assembler::EQ, L);
1768 __ stop("super_check_offset inconsistent");
1769 __ bind(L);
1770 }
1771 #endif //ASSERT
1772
1773 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1774 bool is_oop = true;
1775 if (dest_uninitialized) {
1776 decorators |= IS_DEST_UNINITIALIZED;
1777 }
1778
1779 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1780 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1781
1782 // save the original count
1783 __ mov(count_save, count);
1784
1785 // Copy from low to high addresses
1786 __ mov(start_to, to); // Save destination array start address
1787 __ b(L_load_element);
1788
1789 // ======== begin loop ========
1790 // (Loop is rotated; its entry is L_load_element.)
1791 // Loop control:
1792 // for (; count != 0; count--) {
1793 // copied_oop = load_heap_oop(from++);
1794 // ... generate_type_check ...;
1795 // store_heap_oop(to++, copied_oop);
1796 // }
1797 __ align(OptoLoopAlignment);
1798
1799 __ BIND(L_store_element);
1800 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop
1801 __ sub(count, count, 1);
1802 __ cbz(count, L_do_card_marks);
1803
1804 // ======== loop entry is here ========
1805 __ BIND(L_load_element);
1806 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1807 __ cbz(copied_oop, L_store_element);
1808
1809 __ load_klass(r19_klass, copied_oop);// query the object klass
1810 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1811 // ======== end loop ========
1812
1813 // It was a real error; we must depend on the caller to finish the job.
1814 // Register count = remaining oops, count_orig = total oops.
1815 // Emit GC store barriers for the oops we have copied and report
1816 // their number to the caller.
1817
1818 __ subs(count, count_save, count); // K = partially copied oop count
1819 __ eon(count, count, zr); // report (-1^K) to caller
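// eon with zr is a bitwise NOT, so count now holds -1 ^ K. The flags are
// still those of the subs above, so EQ here means K == 0: nothing was
// copied and the card marks can be skipped.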
1820 __ br(Assembler::EQ, L_done_pop);
1821
1822 __ BIND(L_do_card_marks);
1823 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1824
1825 __ bind(L_done_pop);
1826 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1827 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1828
1829 __ bind(L_done);
1830 __ mov(r0, count);
1831 __ leave();
1832 __ ret(lr);
1833
1834 return start;
1835 }
1836
1837 // Perform range checks on the proposed arraycopy.
1838 // Kills temp, but nothing else.
1839 // Also, clean the sign bits of src_pos and dst_pos.
1840 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1841 Register src_pos, // source position (c_rarg1)
1842 Register dst, // destination array oop (c_rarg2)
1843 Register dst_pos, // destination position (c_rarg3)
1844 Register length,
1845 Register temp,
1846 Label& L_failed) {
1847 BLOCK_COMMENT("arraycopy_range_checks:");
1848
1849 assert_different_registers(rscratch1, temp);
1850
1851 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1852 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1853 __ addw(temp, length, src_pos);
1854 __ cmpw(temp, rscratch1);
1855 __ br(Assembler::HI, L_failed);
1856
1857 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1858 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1859 __ addw(temp, length, dst_pos);
1860 __ cmpw(temp, rscratch1);
1861 __ br(Assembler::HI, L_failed);
1862
1863 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1864 __ movw(src_pos, src_pos);
1865 __ movw(dst_pos, dst_pos);
1866
1867 BLOCK_COMMENT("arraycopy_range_checks done");
1868 }
1869
1870 // These stubs get called from some dumb test routine.
1871 // I'll write them properly when they're called from
1872 // something that's actually doing something.
1873 static void fake_arraycopy_stub(address src, address dst, int count) {
1874 assert(count == 0, "huh?");
1875 }
1876
1877
1878 //
1879 // Generate 'unsafe' array copy stub
1880 // Though just as safe as the other stubs, it takes an unscaled
1881 // size_t argument instead of an element count.
1882 //
1883 // Input:
1884 // c_rarg0 - source array address
1885 // c_rarg1 - destination array address
1886 // c_rarg2 - byte count, treated as ssize_t, can be zero
1887 //
1888 // Examines the alignment of the operands and dispatches
1889 // to a long, int, short, or byte copy loop.
1890 //
1891 address generate_unsafe_copy(const char *name,
1892 address byte_copy_entry,
1893 address short_copy_entry,
1894 address int_copy_entry,
1895 address long_copy_entry) {
1896 Label L_long_aligned, L_int_aligned, L_short_aligned;
1897 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1898
1899 __ align(CodeEntryAlignment);
1900 StubCodeMark mark(this, "StubRoutines", name);
1901 address start = __ pc();
1902 __ enter(); // required for proper stackwalking of RuntimeStub frame
1903
1904 // bump this on entry, not on exit:
1905 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1906
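// OR the source, destination and byte count together: the low bits of the
// result give the coarsest alignment common to all three, which selects
// the widest element size that can safely be copied.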
1907 __ orr(rscratch1, s, d);
1908 __ orr(rscratch1, rscratch1, count);
1909
1910 __ andr(rscratch1, rscratch1, BytesPerLong-1);
1911 __ cbz(rscratch1, L_long_aligned);
1912 __ andr(rscratch1, rscratch1, BytesPerInt-1);
1913 __ cbz(rscratch1, L_int_aligned);
1914 __ tbz(rscratch1, 0, L_short_aligned);
1915 __ b(RuntimeAddress(byte_copy_entry));
1916
1917 __ BIND(L_short_aligned);
1918 __ lsr(count, count, LogBytesPerShort); // size => short_count
1919 __ b(RuntimeAddress(short_copy_entry));
1920 __ BIND(L_int_aligned);
1921 __ lsr(count, count, LogBytesPerInt); // size => int_count
1922 __ b(RuntimeAddress(int_copy_entry));
1923 __ BIND(L_long_aligned);
1924 __ lsr(count, count, LogBytesPerLong); // size => long_count
1925 __ b(RuntimeAddress(long_copy_entry));
1926
1927 return start;
1928 }
1929
1930 //
1931 // Generate generic array copy stubs
1932 //
1933 // Input:
1934 // c_rarg0 - src oop
1935 // c_rarg1 - src_pos (32-bits)
1936 // c_rarg2 - dst oop
1937 // c_rarg3 - dst_pos (32-bits)
1938 // c_rarg4 - element count (32-bits)
1939 //
1940 // Output:
1941 // r0 == 0 - success
1942 // r0 == -1^K - failure, where K is partial transfer count
1943 //
1944 address generate_generic_copy(const char *name,
1945 address byte_copy_entry, address short_copy_entry,
1946 address int_copy_entry, address oop_copy_entry,
1947 address long_copy_entry, address checkcast_copy_entry) {
1948
1949 Label L_failed, L_objArray;
1950 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1951
1952 // Input registers
1953 const Register src = c_rarg0; // source array oop
1954 const Register src_pos = c_rarg1; // source position
1955 const Register dst = c_rarg2; // destination array oop
1956 const Register dst_pos = c_rarg3; // destination position
1957 const Register length = c_rarg4;
1958
1959
1960 // Registers used as temps
1961 const Register dst_klass = c_rarg5;
1962
1963 __ align(CodeEntryAlignment);
1964
1965 StubCodeMark mark(this, "StubRoutines", name);
1966
1967 address start = __ pc();
1968
1969 __ enter(); // required for proper stackwalking of RuntimeStub frame
1970
1971 // bump this on entry, not on exit:
1972 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1973
1974 //-----------------------------------------------------------------------
1975 // Assembler stub will be used for this call to arraycopy
1976 // if the following conditions are met:
1977 //
1978 // (1) src and dst must not be null.
1979 // (2) src_pos must not be negative.
1980 // (3) dst_pos must not be negative.
1981 // (4) length must not be negative.
1982 // (5) src klass and dst klass should be the same and not NULL.
1983 // (6) src and dst should be arrays.
1984 // (7) src_pos + length must not exceed length of src.
1985 // (8) dst_pos + length must not exceed length of dst.
1986 //
1987
1988 // if (src == NULL) return -1;
1989 __ cbz(src, L_failed);
1990
1991 // if (src_pos < 0) return -1;
1992 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
1993
1994 // if (dst == NULL) return -1;
1995 __ cbz(dst, L_failed);
1996
1997 // if (dst_pos < 0) return -1;
1998 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
1999
2000 // registers used as temp
2001 const Register scratch_length = r16; // elements count to copy
2002 const Register scratch_src_klass = r17; // array klass
2003 const Register lh = r18; // layout helper
2004
2005 // if (length < 0) return -1;
2006 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2007 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2008
2009 __ load_klass(scratch_src_klass, src);
2010 #ifdef ASSERT
2011 // assert(src->klass() != NULL);
2012 {
2013 BLOCK_COMMENT("assert klasses not null {");
2014 Label L1, L2;
2015 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL
2016 __ bind(L1);
2017 __ stop("broken null klass");
2018 __ bind(L2);
2019 __ load_klass(rscratch1, dst);
2020 __ cbz(rscratch1, L1); // this would be broken also
2021 BLOCK_COMMENT("} assert klasses not null done");
2022 }
2023 #endif
2024
2025 // Load layout helper (32-bits)
2026 //
2027 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2028 // 32        30    24            16              8     2                 0
2029 //
2030 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2031 //
2032
2033 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2034
2035 // Handle objArrays completely differently...
2036 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2037 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2038 __ movw(rscratch1, objArray_lh);
2039 __ eorw(rscratch2, lh, rscratch1);
2040 __ cbzw(rscratch2, L_objArray);
2041
2042 // if (src->klass() != dst->klass()) return -1;
2043 __ load_klass(rscratch2, dst);
2044 __ eor(rscratch2, rscratch2, scratch_src_klass);
2045 __ cbnz(rscratch2, L_failed);
2046
2047 // if (!src->is_Array()) return -1;
2048 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2049
2050 // At this point, it is known to be a typeArray (array_tag 0x3).
2051 #ifdef ASSERT
2052 {
2053 BLOCK_COMMENT("assert primitive array {");
2054 Label L;
2055 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2056 __ cmpw(lh, rscratch2);
2057 __ br(Assembler::GE, L);
2058 __ stop("must be a primitive array");
2059 __ bind(L);
2060 BLOCK_COMMENT("} assert primitive array done");
2061 }
2062 #endif
2063
2064 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2065 rscratch2, L_failed);
2066
2067 // TypeArrayKlass
2068 //
2069 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2070 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2071 //
2072
2073 const Register rscratch1_offset = rscratch1; // array offset
2074 const Register r18_elsize = lh; // element size
2075
2076 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2077 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2078 __ add(src, src, rscratch1_offset); // src array offset
2079 __ add(dst, dst, rscratch1_offset); // dst array offset
2080 BLOCK_COMMENT("choose copy loop based on element size");
2081
2082 // next registers should be set before the jump to corresponding stub
2083 const Register from = c_rarg0; // source array address
2084 const Register to = c_rarg1; // destination array address
2085 const Register count = c_rarg2; // elements count
2086
2087 // 'from', 'to', 'count' registers should be set in such order
2088 // since they are the same as 'src', 'src_pos', 'dst'.
2089
2090 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2091
2092 // The possible values of elsize are 0-3, i.e. exact_log2(element
2093 // size in bytes). We do a simple bitwise binary search.
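// For example, elsize == 0b10 (an int array) takes the tbnz on bit 1 to
// L_copy_ints, where the clear bit 0 keeps it out of L_copy_longs.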
2094 __ BIND(L_copy_bytes);
2095 __ tbnz(r18_elsize, 1, L_copy_ints);
2096 __ tbnz(r18_elsize, 0, L_copy_shorts);
2097 __ lea(from, Address(src, src_pos));// src_addr
2098 __ lea(to, Address(dst, dst_pos));// dst_addr
2099 __ movw(count, scratch_length); // length
2100 __ b(RuntimeAddress(byte_copy_entry));
2101
2102 __ BIND(L_copy_shorts);
2103 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2104 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2105 __ movw(count, scratch_length); // length
2106 __ b(RuntimeAddress(short_copy_entry));
2107
2108 __ BIND(L_copy_ints);
2109 __ tbnz(r18_elsize, 0, L_copy_longs);
2110 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2111 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2112 __ movw(count, scratch_length); // length
2113 __ b(RuntimeAddress(int_copy_entry));
2114
2115 __ BIND(L_copy_longs);
2116 #ifdef ASSERT
2117 {
2118 BLOCK_COMMENT("assert long copy {");
2119 Label L;
2120 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2121 __ cmpw(r18_elsize, LogBytesPerLong);
2122 __ br(Assembler::EQ, L);
2123 __ stop("must be long copy, but elsize is wrong");
2124 __ bind(L);
2125 BLOCK_COMMENT("} assert long copy done");
2126 }
2127 #endif
2128 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2129 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2130 __ movw(count, scratch_length); // length
2131 __ b(RuntimeAddress(long_copy_entry));
2132
2133 // ObjArrayKlass
2134 __ BIND(L_objArray);
2135 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2136
2137 Label L_plain_copy, L_checkcast_copy;
2138 // test array classes for subtyping
2139 __ load_klass(r18, dst);
2140 __ cmp(scratch_src_klass, r18); // usual case is exact equality
2141 __ br(Assembler::NE, L_checkcast_copy);
2142
2143 // Identically typed arrays can be copied without element-wise checks.
2144 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2145 rscratch2, L_failed);
2146
2147 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2148 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2149 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2150 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2151 __ movw(count, scratch_length); // length
2152 __ BIND(L_plain_copy);
2153 __ b(RuntimeAddress(oop_copy_entry));
2154
2155 __ BIND(L_checkcast_copy);
2156 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass)
2157 {
2158 // Before looking at dst.length, make sure dst is also an objArray.
2159 __ ldrw(rscratch1, Address(r18, lh_offset));
2160 __ movw(rscratch2, objArray_lh);
2161 __ eorw(rscratch1, rscratch1, rscratch2);
2162 __ cbnzw(rscratch1, L_failed);
2163
2164 // It is safe to examine both src.length and dst.length.
2165 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2166 r18, L_failed);
2167
2168 __ load_klass(dst_klass, dst); // reload
2169
2170 // Marshal the base address arguments now, freeing registers.
2171 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2172 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2173 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2174 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2175 __ movw(count, length); // length (reloaded)
2176 Register sco_temp = c_rarg3; // this register is free now
2177 assert_different_registers(from, to, count, sco_temp,
2178 dst_klass, scratch_src_klass);
2179 // assert_clean_int(count, sco_temp);
2180
2181 // Generate the type check.
2182 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2183 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2184
2185 // Smashes rscratch1, rscratch2
2186 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2187
2188 // Fetch destination element klass from the ObjArrayKlass header.
2189 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2190 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2191 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2192
2193 // the checkcast_copy loop needs two extra arguments:
2194 assert(c_rarg3 == sco_temp, "#3 already in place");
2195 // Set up arguments for checkcast_copy_entry.
2196 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2197 __ b(RuntimeAddress(checkcast_copy_entry));
2198 }
2199
2200 __ BIND(L_failed);
2201 __ mov(r0, -1);
2202 __ leave(); // required for proper stackwalking of RuntimeStub frame
2203 __ ret(lr);
2204
2205 return start;
2206 }
2207
2208 //
2209 // Generate stub for array fill. If "aligned" is true, the
2210 // "to" address is assumed to be heapword aligned.
2211 //
2212 // Arguments for generated stub:
2213 // to: c_rarg0
2214 // value: c_rarg1
2215 // count: c_rarg2 treated as signed
2216 //
2217 address generate_fill(BasicType t, bool aligned, const char *name) {
2218 __ align(CodeEntryAlignment);
2219 StubCodeMark mark(this, "StubRoutines", name);
2220 address start = __ pc();
2221
2222 BLOCK_COMMENT("Entry:");
2223
2224 const Register to = c_rarg0; // source array address
2225 const Register value = c_rarg1; // value
2226 const Register count = c_rarg2; // elements count
2227
2228 const Register bz_base = r10; // base for block_zero routine
2229 const Register cnt_words = r11; // temp register
2230
2231 __ enter();
2232
2233 Label L_fill_elements, L_exit1;
2234
2235 int shift = -1;
2236 switch (t) {
2237 case T_BYTE:
2238 shift = 0;
2239 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2240 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2241 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2242 __ br(Assembler::LO, L_fill_elements);
2243 break;
2244 case T_SHORT:
2245 shift = 1;
2246 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2247 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2248 __ br(Assembler::LO, L_fill_elements);
2249 break;
2250 case T_INT:
2251 shift = 2;
2252 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2253 __ br(Assembler::LO, L_fill_elements);
2254 break;
2255 default: ShouldNotReachHere();
2256 }
2257
2258 // Align source address at 8 bytes address boundary.
2259 Label L_skip_align1, L_skip_align2, L_skip_align4;
2260 if (!aligned) {
2261 switch (t) {
2262 case T_BYTE:
2263 // One byte misalignment happens only for byte arrays.
2264 __ tbz(to, 0, L_skip_align1);
2265 __ strb(value, Address(__ post(to, 1)));
2266 __ subw(count, count, 1);
2267 __ bind(L_skip_align1);
2268 // Fallthrough
2269 case T_SHORT:
2270 // Two bytes misalignment happens only for byte and short (char) arrays.
2271 __ tbz(to, 1, L_skip_align2);
2272 __ strh(value, Address(__ post(to, 2)));
2273 __ subw(count, count, 2 >> shift);
2274 __ bind(L_skip_align2);
2275 // Fallthrough
2276 case T_INT:
2277 // Align to 8 bytes, we know we are 4 byte aligned to start.
2278 __ tbz(to, 2, L_skip_align4);
2279 __ strw(value, Address(__ post(to, 4)));
2280 __ subw(count, count, 4 >> shift);
2281 __ bind(L_skip_align4);
2282 break;
2283 default: ShouldNotReachHere();
2284 }
2285 }
2286
2287 //
2288 // Fill large chunks
2289 //
2290 __ lsrw(cnt_words, count, 3 - shift); // number of words
2291 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2292 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
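// value has been replicated into a full 64-bit pattern, and count now
// holds the tail elements left over after cnt_words 8-byte stores.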
2293 if (UseBlockZeroing) {
2294 Label non_block_zeroing, rest;
2295 // If the fill value is zero we can use the fast zero_words().
2296 __ cbnz(value, non_block_zeroing);
2297 __ mov(bz_base, to);
2298 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2299 __ zero_words(bz_base, cnt_words);
2300 __ b(rest);
2301 __ bind(non_block_zeroing);
2302 __ fill_words(to, cnt_words, value);
2303 __ bind(rest);
2304 } else {
2305 __ fill_words(to, cnt_words, value);
2306 }
2307
2308 // Remaining count is less than 8 bytes. Fill it by a single store.
2309 // Note that the total length is no less than 8 bytes.
2310 if (t == T_BYTE || t == T_SHORT) {
2311 Label L_exit1;
2312 __ cbzw(count, L_exit1);
2313 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2314 __ str(value, Address(to, -8)); // overwrite some elements
2315 __ bind(L_exit1);
2316 __ leave();
2317 __ ret(lr);
2318 }
2319
2320 // Handle copies less than 8 bytes.
2321 Label L_fill_2, L_fill_4, L_exit2;
2322 __ bind(L_fill_elements);
2323 switch (t) {
2324 case T_BYTE:
2325 __ tbz(count, 0, L_fill_2);
2326 __ strb(value, Address(__ post(to, 1)));
2327 __ bind(L_fill_2);
2328 __ tbz(count, 1, L_fill_4);
2329 __ strh(value, Address(__ post(to, 2)));
2330 __ bind(L_fill_4);
2331 __ tbz(count, 2, L_exit2);
2332 __ strw(value, Address(to));
2333 break;
2334 case T_SHORT:
2335 __ tbz(count, 0, L_fill_4);
2336 __ strh(value, Address(__ post(to, 2)));
2337 __ bind(L_fill_4);
2338 __ tbz(count, 1, L_exit2);
2339 __ strw(value, Address(to));
2340 break;
2341 case T_INT:
2342 __ cbzw(count, L_exit2);
2343 __ strw(value, Address(to));
2344 break;
2345 default: ShouldNotReachHere();
2346 }
2347 __ bind(L_exit2);
2348 __ leave();
2349 __ ret(lr);
2350 return start;
2351 }
2352
2353 void generate_arraycopy_stubs() {
2354 address entry;
2355 address entry_jbyte_arraycopy;
2356 address entry_jshort_arraycopy;
2357 address entry_jint_arraycopy;
2358 address entry_oop_arraycopy;
2359 address entry_jlong_arraycopy;
2360 address entry_checkcast_arraycopy;
2361
2362 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2363 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2364
2365 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2366
2367 //*** jbyte
2368 // Always need aligned and unaligned versions
2369 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2370 "jbyte_disjoint_arraycopy");
2371 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
2372 &entry_jbyte_arraycopy,
2373 "jbyte_arraycopy");
2374 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2375 "arrayof_jbyte_disjoint_arraycopy");
2376 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
2377 "arrayof_jbyte_arraycopy");
2378
2379 //*** jshort
2380 // Always need aligned and unaligned versions
2381 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2382 "jshort_disjoint_arraycopy");
2383 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
2384 &entry_jshort_arraycopy,
2385 "jshort_arraycopy");
2386 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2387 "arrayof_jshort_disjoint_arraycopy");
2388 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
2389 "arrayof_jshort_arraycopy");
2390
2391 //*** jint
2392 // Aligned versions
2393 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2394 "arrayof_jint_disjoint_arraycopy");
2395 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2396 "arrayof_jint_arraycopy");
2397 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2398 // entry_jint_arraycopy always points to the unaligned version
2399 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
2400 "jint_disjoint_arraycopy");
2401 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
2402 &entry_jint_arraycopy,
2403 "jint_arraycopy");
2404
2405 //*** jlong
2406 // It is always aligned
2407 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2408 "arrayof_jlong_disjoint_arraycopy");
2409 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2410 "arrayof_jlong_arraycopy");
2411 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2412 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2413
2414 //*** oops
2415 {
2416 // With compressed oops we need unaligned versions; notice that
2417 // we overwrite entry_oop_arraycopy.
2418 bool aligned = !UseCompressedOops;
2419
2420 StubRoutines::_arrayof_oop_disjoint_arraycopy
2421 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2422 /*dest_uninitialized*/false);
2423 StubRoutines::_arrayof_oop_arraycopy
2424 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2425 /*dest_uninitialized*/false);
2426 // Aligned versions without pre-barriers
2427 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2428 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2429 /*dest_uninitialized*/true);
2430 StubRoutines::_arrayof_oop_arraycopy_uninit
2431 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2432 /*dest_uninitialized*/true);
2433 }
2434
2435 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2436 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2437 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2438 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2439
2440 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2441 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2442 /*dest_uninitialized*/true);
2443
2444 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
2445 entry_jbyte_arraycopy,
2446 entry_jshort_arraycopy,
2447 entry_jint_arraycopy,
2448 entry_jlong_arraycopy);
2449
2450 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
2451 entry_jbyte_arraycopy,
2452 entry_jshort_arraycopy,
2453 entry_jint_arraycopy,
2454 entry_oop_arraycopy,
2455 entry_jlong_arraycopy,
2456 entry_checkcast_arraycopy);
2457
2458 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2459 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2460 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2461 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2462 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2463 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2464 }
2465
2466 void generate_math_stubs() { Unimplemented(); }
2467
2468 // Arguments:
2469 //
2470 // Inputs:
2471 // c_rarg0 - source byte array address
2472 // c_rarg1 - destination byte array address
2473 // c_rarg2 - K (key) in little endian int array
2474 //
2475 address generate_aescrypt_encryptBlock() {
2476 __ align(CodeEntryAlignment);
2477 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2478
2479 Label L_doLast;
2480
2481 const Register from = c_rarg0; // source array address
2482 const Register to = c_rarg1; // destination array address
2483 const Register key = c_rarg2; // key array address
2484 const Register keylen = rscratch1;
2485
2486 address start = __ pc();
2487 __ enter();
2488
2489 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
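// keylen is the expanded key length in ints: 44, 52 or 60 for 128-, 192-
// and 256-bit keys respectively; the cmpw checks below use it to skip
// the rounds that shorter keys do not need.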
2490
2491 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2492
2493 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
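// rev32 reverses the bytes within each 32-bit lane: the round keys arrive
// as a Java int array, and the AES instructions consume the key bytes in
// the opposite order on this little-endian target.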
2494 __ rev32(v1, __ T16B, v1);
2495 __ rev32(v2, __ T16B, v2);
2496 __ rev32(v3, __ T16B, v3);
2497 __ rev32(v4, __ T16B, v4);
2498 __ aese(v0, v1);
2499 __ aesmc(v0, v0);
2500 __ aese(v0, v2);
2501 __ aesmc(v0, v0);
2502 __ aese(v0, v3);
2503 __ aesmc(v0, v0);
2504 __ aese(v0, v4);
2505 __ aesmc(v0, v0);
2506
2507 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2508 __ rev32(v1, __ T16B, v1);
2509 __ rev32(v2, __ T16B, v2);
2510 __ rev32(v3, __ T16B, v3);
2511 __ rev32(v4, __ T16B, v4);
2512 __ aese(v0, v1);
2513 __ aesmc(v0, v0);
2514 __ aese(v0, v2);
2515 __ aesmc(v0, v0);
2516 __ aese(v0, v3);
2517 __ aesmc(v0, v0);
2518 __ aese(v0, v4);
2519 __ aesmc(v0, v0);
2520
2521 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2522 __ rev32(v1, __ T16B, v1);
2523 __ rev32(v2, __ T16B, v2);
2524
2525 __ cmpw(keylen, 44);
2526 __ br(Assembler::EQ, L_doLast);
2527
2528 __ aese(v0, v1);
2529 __ aesmc(v0, v0);
2530 __ aese(v0, v2);
2531 __ aesmc(v0, v0);
2532
2533 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2534 __ rev32(v1, __ T16B, v1);
2535 __ rev32(v2, __ T16B, v2);
2536
2537 __ cmpw(keylen, 52);
2538 __ br(Assembler::EQ, L_doLast);
2539
2540 __ aese(v0, v1);
2541 __ aesmc(v0, v0);
2542 __ aese(v0, v2);
2543 __ aesmc(v0, v0);
2544
2545 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2546 __ rev32(v1, __ T16B, v1);
2547 __ rev32(v2, __ T16B, v2);
2548
2549 __ BIND(L_doLast);
2550
2551 __ aese(v0, v1);
2552 __ aesmc(v0, v0);
2553 __ aese(v0, v2);
2554
2555 __ ld1(v1, __ T16B, key);
2556 __ rev32(v1, __ T16B, v1);
2557 __ eor(v0, __ T16B, v0, v1);
2558
2559 __ st1(v0, __ T16B, to);
2560
2561 __ mov(r0, 0);
2562
2563 __ leave();
2564 __ ret(lr);
2565
2566 return start;
2567 }
2568
2569 // Arguments:
2570 //
2571 // Inputs:
2572 // c_rarg0 - source byte array address
2573 // c_rarg1 - destination byte array address
2574 // c_rarg2 - K (key) in little endian int array
2575 //
2576 address generate_aescrypt_decryptBlock() {
2577 assert(UseAES, "need AES instruction support");
2578 __ align(CodeEntryAlignment);
2579 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2580 Label L_doLast;
2581
2582 const Register from = c_rarg0; // source array address
2583 const Register to = c_rarg1; // destination array address
2584 const Register key = c_rarg2; // key array address
2585 const Register keylen = rscratch1;
2586
2587 address start = __ pc();
2588 __ enter(); // required for proper stackwalking of RuntimeStub frame
2589
2590 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2591
2592 __ ld1(v0, __ T16B, from); // get 16 bytes of input
2593
2594 __ ld1(v5, __ T16B, __ post(key, 16));
2595 __ rev32(v5, __ T16B, v5);
2596
2597 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2598 __ rev32(v1, __ T16B, v1);
2599 __ rev32(v2, __ T16B, v2);
2600 __ rev32(v3, __ T16B, v3);
2601 __ rev32(v4, __ T16B, v4);
2602 __ aesd(v0, v1);
2603 __ aesimc(v0, v0);
2604 __ aesd(v0, v2);
2605 __ aesimc(v0, v0);
2606 __ aesd(v0, v3);
2607 __ aesimc(v0, v0);
2608 __ aesd(v0, v4);
2609 __ aesimc(v0, v0);
2610
2611 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2612 __ rev32(v1, __ T16B, v1);
2613 __ rev32(v2, __ T16B, v2);
2614 __ rev32(v3, __ T16B, v3);
2615 __ rev32(v4, __ T16B, v4);
2616 __ aesd(v0, v1);
2617 __ aesimc(v0, v0);
2618 __ aesd(v0, v2);
2619 __ aesimc(v0, v0);
2620 __ aesd(v0, v3);
2621 __ aesimc(v0, v0);
2622 __ aesd(v0, v4);
2623 __ aesimc(v0, v0);
2624
2625 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2626 __ rev32(v1, __ T16B, v1);
2627 __ rev32(v2, __ T16B, v2);
2628
2629 __ cmpw(keylen, 44);
2630 __ br(Assembler::EQ, L_doLast);
2631
2632 __ aesd(v0, v1);
2633 __ aesimc(v0, v0);
2634 __ aesd(v0, v2);
2635 __ aesimc(v0, v0);
2636
2637 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2638 __ rev32(v1, __ T16B, v1);
2639 __ rev32(v2, __ T16B, v2);
2640
2641 __ cmpw(keylen, 52);
2642 __ br(Assembler::EQ, L_doLast);
2643
2644 __ aesd(v0, v1);
2645 __ aesimc(v0, v0);
2646 __ aesd(v0, v2);
2647 __ aesimc(v0, v0);
2648
2649 __ ld1(v1, v2, __ T16B, __ post(key, 32));
2650 __ rev32(v1, __ T16B, v1);
2651 __ rev32(v2, __ T16B, v2);
2652
2653 __ BIND(L_doLast);
2654
2655 __ aesd(v0, v1);
2656 __ aesimc(v0, v0);
2657 __ aesd(v0, v2);
2658
2659 __ eor(v0, __ T16B, v0, v5);
2660
2661 __ st1(v0, __ T16B, to);
2662
2663 __ mov(r0, 0);
2664
2665 __ leave();
2666 __ ret(lr);
2667
2668 return start;
2669 }
2670
2671 // Arguments:
2672 //
2673 // Inputs:
2674 // c_rarg0 - source byte array address
2675 // c_rarg1 - destination byte array address
2676 // c_rarg2 - K (key) in little endian int array
2677 // c_rarg3 - r vector byte array address
2678 // c_rarg4 - input length
2679 //
2680 // Output:
2681 // x0 - input length
2682 //
2683 address generate_cipherBlockChaining_encryptAESCrypt() {
2684 assert(UseAES, "need AES instruction support");
2685 __ align(CodeEntryAlignment);
2686 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2687
2688 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2689
2690 const Register from = c_rarg0; // source array address
2691 const Register to = c_rarg1; // destination array address
2692 const Register key = c_rarg2; // key array address
2693 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2694 // and left with the results of the last encryption block
2695 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2696 const Register keylen = rscratch1;
2697
2698 address start = __ pc();
2699
2700 __ enter();
2701
2702 __ movw(rscratch2, len_reg);
2703
2704 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2705
2706 __ ld1(v0, __ T16B, rvec);
2707
2708 __ cmpw(keylen, 52);
2709 __ br(Assembler::CC, L_loadkeys_44);
2710 __ br(Assembler::EQ, L_loadkeys_52);
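// Flags from the cmpw above select the key length: CC => 44 ints
// (AES-128), EQ => 52 (AES-192), otherwise 60 (AES-256). Nothing below
// disturbs the flags, so the same comparison steers the round count
// inside L_aes_loop.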
2711
2712 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2713 __ rev32(v17, __ T16B, v17);
2714 __ rev32(v18, __ T16B, v18);
2715 __ BIND(L_loadkeys_52);
2716 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2717 __ rev32(v19, __ T16B, v19);
2718 __ rev32(v20, __ T16B, v20);
2719 __ BIND(L_loadkeys_44);
2720 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2721 __ rev32(v21, __ T16B, v21);
2722 __ rev32(v22, __ T16B, v22);
2723 __ rev32(v23, __ T16B, v23);
2724 __ rev32(v24, __ T16B, v24);
2725 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2726 __ rev32(v25, __ T16B, v25);
2727 __ rev32(v26, __ T16B, v26);
2728 __ rev32(v27, __ T16B, v27);
2729 __ rev32(v28, __ T16B, v28);
2730 __ ld1(v29, v30, v31, __ T16B, key);
2731 __ rev32(v29, __ T16B, v29);
2732 __ rev32(v30, __ T16B, v30);
2733 __ rev32(v31, __ T16B, v31);
2734
2735 __ BIND(L_aes_loop);
2736 __ ld1(v1, __ T16B, __ post(from, 16));
2737 __ eor(v0, __ T16B, v0, v1);
2738
2739 __ br(Assembler::CC, L_rounds_44);
2740 __ br(Assembler::EQ, L_rounds_52);
2741
2742 __ aese(v0, v17); __ aesmc(v0, v0);
2743 __ aese(v0, v18); __ aesmc(v0, v0);
2744 __ BIND(L_rounds_52);
2745 __ aese(v0, v19); __ aesmc(v0, v0);
2746 __ aese(v0, v20); __ aesmc(v0, v0);
2747 __ BIND(L_rounds_44);
2748 __ aese(v0, v21); __ aesmc(v0, v0);
2749 __ aese(v0, v22); __ aesmc(v0, v0);
2750 __ aese(v0, v23); __ aesmc(v0, v0);
2751 __ aese(v0, v24); __ aesmc(v0, v0);
2752 __ aese(v0, v25); __ aesmc(v0, v0);
2753 __ aese(v0, v26); __ aesmc(v0, v0);
2754 __ aese(v0, v27); __ aesmc(v0, v0);
2755 __ aese(v0, v28); __ aesmc(v0, v0);
2756 __ aese(v0, v29); __ aesmc(v0, v0);
2757 __ aese(v0, v30);
2758 __ eor(v0, __ T16B, v0, v31);
2759
2760 __ st1(v0, __ T16B, __ post(to, 16));
2761
2762 __ subw(len_reg, len_reg, 16);
2763 __ cbnzw(len_reg, L_aes_loop);
2764
2765 __ st1(v0, __ T16B, rvec);
2766
2767 __ mov(r0, rscratch2);
2768
2769 __ leave();
2770 __ ret(lr);
2771
2772 return start;
2773 }
2774
2775 // Arguments:
2776 //
2777 // Inputs:
2778 // c_rarg0 - source byte array address
2779 // c_rarg1 - destination byte array address
2780 // c_rarg2 - K (key) in little endian int array
2781 // c_rarg3 - r vector byte array address
2782 // c_rarg4 - input length
2783 //
2784 // Output:
2785 // r0 - input length
2786 //
2787 address generate_cipherBlockChaining_decryptAESCrypt() {
2788 assert(UseAES, "need AES instruction support");
2789 __ align(CodeEntryAlignment);
2790 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2791
2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2793
2794 const Register from = c_rarg0; // source array address
2795 const Register to = c_rarg1; // destination array address
2796 const Register key = c_rarg2; // key array address
2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2798 // and left with the last ciphertext block (the chaining value for the next call)
2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2800 const Register keylen = rscratch1;
2801
2802 address start = __ pc();
2803
2804 __ enter();
2805
2806 __ movw(rscratch2, len_reg);
2807
2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2809
2810 __ ld1(v2, __ T16B, rvec);
2811
2812 __ ld1(v31, __ T16B, __ post(key, 16));
2813 __ rev32(v31, __ T16B, v31);
2814
2815 __ cmpw(keylen, 52);
2816 __ br(Assembler::CC, L_loadkeys_44);
2817 __ br(Assembler::EQ, L_loadkeys_52);
2818
2819 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2820 __ rev32(v17, __ T16B, v17);
2821 __ rev32(v18, __ T16B, v18);
2822 __ BIND(L_loadkeys_52);
2823 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2824 __ rev32(v19, __ T16B, v19);
2825 __ rev32(v20, __ T16B, v20);
2826 __ BIND(L_loadkeys_44);
2827 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2828 __ rev32(v21, __ T16B, v21);
2829 __ rev32(v22, __ T16B, v22);
2830 __ rev32(v23, __ T16B, v23);
2831 __ rev32(v24, __ T16B, v24);
2832 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2833 __ rev32(v25, __ T16B, v25);
2834 __ rev32(v26, __ T16B, v26);
2835 __ rev32(v27, __ T16B, v27);
2836 __ rev32(v28, __ T16B, v28);
2837 __ ld1(v29, v30, __ T16B, key);
2838 __ rev32(v29, __ T16B, v29);
2839 __ rev32(v30, __ T16B, v30);
2840
2841 __ BIND(L_aes_loop);
2842 __ ld1(v0, __ T16B, __ post(from, 16));
2843 __ orr(v1, __ T16B, v0, v0);
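// orr of a register with itself is a vector move: v1 keeps the raw
// ciphertext block so it can become the next chaining value (v2) once
// this block has been decrypted.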
2844
2845 __ br(Assembler::CC, L_rounds_44);
2846 __ br(Assembler::EQ, L_rounds_52);
2847
2848 __ aesd(v0, v17); __ aesimc(v0, v0);
2849 __ aesd(v0, v18); __ aesimc(v0, v0);
2850 __ BIND(L_rounds_52);
2851 __ aesd(v0, v19); __ aesimc(v0, v0);
2852 __ aesd(v0, v20); __ aesimc(v0, v0);
2853 __ BIND(L_rounds_44);
2854 __ aesd(v0, v21); __ aesimc(v0, v0);
2855 __ aesd(v0, v22); __ aesimc(v0, v0);
2856 __ aesd(v0, v23); __ aesimc(v0, v0);
2857 __ aesd(v0, v24); __ aesimc(v0, v0);
2858 __ aesd(v0, v25); __ aesimc(v0, v0);
2859 __ aesd(v0, v26); __ aesimc(v0, v0);
2860 __ aesd(v0, v27); __ aesimc(v0, v0);
2861 __ aesd(v0, v28); __ aesimc(v0, v0);
2862 __ aesd(v0, v29); __ aesimc(v0, v0);
2863 __ aesd(v0, v30);
2864 __ eor(v0, __ T16B, v0, v31);
2865 __ eor(v0, __ T16B, v0, v2);
2866
2867 __ st1(v0, __ T16B, __ post(to, 16));
2868 __ orr(v2, __ T16B, v1, v1);
2869
2870 __ subw(len_reg, len_reg, 16);
2871 __ cbnzw(len_reg, L_aes_loop);
2872
2873 __ st1(v2, __ T16B, rvec);
2874
2875 __ mov(r0, rscratch2);
2876
2877 __ leave();
2878 __ ret(lr);
2879
2880 return start;
2881 }
2882
2883 // Arguments:
2884 //
2885 // Inputs:
2886 // c_rarg0 - byte[] source+offset
2887 // c_rarg1 - int[] SHA.state
2888 // c_rarg2 - int offset
2889 // c_rarg3 - int limit
2890 //
2891 address generate_sha1_implCompress(bool multi_block, const char *name) {
2892 __ align(CodeEntryAlignment);
2893 StubCodeMark mark(this, "StubRoutines", name);
2894 address start = __ pc();
2895
2896 Register buf = c_rarg0;
2897 Register state = c_rarg1;
2898 Register ofs = c_rarg2;
2899 Register limit = c_rarg3;
2900
2901 Label keys;
2902 Label sha1_loop;
2903
2904 // load the keys into v0..v3
2905 __ adr(rscratch1, keys);
2906 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2907 // load 5 words state into v6, v7
2908 __ ldrq(v6, Address(state, 0));
2909 __ ldrs(v7, Address(state, 16));
2910
2911
2912 __ BIND(sha1_loop);
2913 // load 64 bytes of data into v16..v19
2914 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2915 __ rev32(v16, __ T16B, v16);
2916 __ rev32(v17, __ T16B, v17);
2917 __ rev32(v18, __ T16B, v18);
2918 __ rev32(v19, __ T16B, v19);
2919
2920 // do the sha1
2921 __ addv(v4, __ T4S, v16, v0);
2922 __ orr(v20, __ T16B, v6, v6);
2923
2924 FloatRegister d0 = v16;
2925 FloatRegister d1 = v17;
2926 FloatRegister d2 = v18;
2927 FloatRegister d3 = v19;
2928
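// 20 iterations of four SHA-1 rounds each, 80 rounds in total. Per
// FIPS 180-4, sha1c (choose) covers rounds 0-19, sha1p (parity) rounds
// 20-39 and 60-79, and sha1m (majority) rounds 40-59; sha1su0/sha1su1
// extend the message schedule as we go.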
2929 for (int round = 0; round < 20; round++) {
2930 FloatRegister tmp1 = (round & 1) ? v4 : v5;
2931 FloatRegister tmp2 = (round & 1) ? v21 : v22;
2932 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2933 FloatRegister tmp4 = (round & 1) ? v5 : v4;
2934 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2935
2936 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2937 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2938 __ sha1h(tmp2, __ T4S, v20);
2939 if (round < 5)
2940 __ sha1c(v20, __ T4S, tmp3, tmp4);
2941 else if (round < 10 || round >= 15)
2942 __ sha1p(v20, __ T4S, tmp3, tmp4);
2943 else
2944 __ sha1m(v20, __ T4S, tmp3, tmp4);
2945 if (round < 16) __ sha1su1(d0, __ T4S, d3);
2946
2947 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2948 }
2949
2950 __ addv(v7, __ T2S, v7, v21);
2951 __ addv(v6, __ T4S, v6, v20);
2952
2953 if (multi_block) {
2954 __ add(ofs, ofs, 64);
2955 __ cmp(ofs, limit);
2956 __ br(Assembler::LE, sha1_loop);
2957 __ mov(c_rarg0, ofs); // return ofs
2958 }
2959
2960 __ strq(v6, Address(state, 0));
2961 __ strs(v7, Address(state, 16));
2962
2963 __ ret(lr);
2964
2965 __ bind(keys);
2966 __ emit_int32(0x5a827999);
2967 __ emit_int32(0x6ed9eba1);
2968 __ emit_int32(0x8f1bbcdc);
2969 __ emit_int32(0xca62c1d6);
2970
2971 return start;
2972 }
2973
2974
2975 // Arguments:
2976 //
2977 // Inputs:
2978 // c_rarg0 - byte[] source+offset
2979 // c_rarg1 - int[] SHA.state
2980 // c_rarg2 - int offset
2981 // c_rarg3 - int limit
2982 //
2983 address generate_sha256_implCompress(bool multi_block, const char *name) {
2984 static const uint32_t round_consts[64] = {
2985 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2986 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2987 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2988 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2989 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2990 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2991 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2992 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2993 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2994 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2995 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2996 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2997 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2998 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2999 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3000 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3001 };
3002 __ align(CodeEntryAlignment);
3003 StubCodeMark mark(this, "StubRoutines", name);
3004 address start = __ pc();
3005
3006 Register buf = c_rarg0;
3007 Register state = c_rarg1;
3008 Register ofs = c_rarg2;
3009 Register limit = c_rarg3;
3010
3011 Label sha1_loop;
3012
3013 __ stpd(v8, v9, __ pre(sp, -32));
3014 __ stpd(v10, v11, Address(sp, 16));
3015
3016 // dga == v0
3017 // dgb == v1
3018 // dg0 == v2
3019 // dg1 == v3
3020 // dg2 == v4
3021 // t0 == v6
3022 // t1 == v7
3023
3024 // load 16 keys to v16..v31
3025 __ lea(rscratch1, ExternalAddress((address)round_consts));
3026 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3027 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3028 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3029 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3030
3031 // load 8 words (256 bits) state
3032 __ ldpq(v0, v1, state);
3033
3034 __ BIND(sha1_loop);
3035 // load 64 bytes of data into v8..v11
3036 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3037 __ rev32(v8, __ T16B, v8);
3038 __ rev32(v9, __ T16B, v9);
3039 __ rev32(v10, __ T16B, v10);
3040 __ rev32(v11, __ T16B, v11);
3041
3042 __ addv(v6, __ T4S, v8, v16);
3043 __ orr(v2, __ T16B, v0, v0);
3044 __ orr(v3, __ T16B, v1, v1);
3045
3046 FloatRegister d0 = v8;
3047 FloatRegister d1 = v9;
3048 FloatRegister d2 = v10;
3049 FloatRegister d3 = v11;
3050
3051
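// 16 iterations of four SHA-256 rounds each, 64 rounds in total.
// sha256su0/sha256su1 extend the message schedule while sha256h/sha256h2
// apply the compression rounds to the pre-added round constants in t0/t1.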
3052 for (int round = 0; round < 16; round++) {
3053 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3054 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3055 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3056 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3057
3058 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3059 __ orr(v4, __ T16B, v2, v2);
3060 if (round < 15)
3061 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3062 __ sha256h(v2, __ T4S, v3, tmp2);
3063 __ sha256h2(v3, __ T4S, v4, tmp2);
3064 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3065
3066 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3067 }
3068
3069 __ addv(v0, __ T4S, v0, v2);
3070 __ addv(v1, __ T4S, v1, v3);
3071
3072 if (multi_block) {
3073 __ add(ofs, ofs, 64);
3074 __ cmp(ofs, limit);
3075 __ br(Assembler::LE, sha1_loop);
3076 __ mov(c_rarg0, ofs); // return ofs
3077 }
3078
3079 __ ldpd(v10, v11, Address(sp, 16));
3080 __ ldpd(v8, v9, __ post(sp, 32));
3081
3082 __ stpq(v0, v1, state);
3083
3084 __ ret(lr);
3085
3086 return start;
3087 }
3088
3089 // Safefetch stubs.
3090 void generate_safefetch(const char* name, int size, address* entry,
3091 address* fault_pc, address* continuation_pc) {
3092 // safefetch signatures:
3093 // int SafeFetch32(int* adr, int errValue);
3094 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3095 //
3096 // arguments:
3097 // c_rarg0 = adr
3098 // c_rarg1 = errValue
3099 //
3100 // result:
3101 // r0 = *adr or errValue
3102
3103 StubCodeMark mark(this, "StubRoutines", name);
3104
3105 // Entry point, pc or function descriptor.
3106 *entry = __ pc();
3107
3108 // Load *adr into c_rarg1, may fault.
3109 *fault_pc = __ pc();
3110 switch (size) {
3111 case 4:
3112 // int32_t
3113 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3114 break;
3115 case 8:
3116 // int64_t
3117 __ ldr(c_rarg1, Address(c_rarg0, 0));
3118 break;
3119 default:
3120 ShouldNotReachHere();
3121 }
3122
3123 // return errValue or *adr
3124 *continuation_pc = __ pc();
3125 __ mov(r0, c_rarg1);
3126 __ ret(lr);
3127 }
3128
3129 /**
3130 * Arguments:
3131 *
3132 * Inputs:
3133 * c_rarg0 - int crc
3134 * c_rarg1 - byte* buf
3135 * c_rarg2 - int length
3136 *
3137 * Output:
3138 * r0 - int crc result
3139 */
3140 address generate_updateBytesCRC32() {
3141 assert(UseCRC32Intrinsics, "what are we doing here?");
3142
3143 __ align(CodeEntryAlignment);
3144 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3145
3146 address start = __ pc();
3147
3148 const Register crc = c_rarg0; // crc
3149 const Register buf = c_rarg1; // source java byte array address
3150 const Register len = c_rarg2; // length
3151 const Register table0 = c_rarg3; // crc_table address
3152 const Register table1 = c_rarg4;
3153 const Register table2 = c_rarg5;
3154 const Register table3 = c_rarg6;
3155 const Register tmp3 = c_rarg7;
3156
3157 BLOCK_COMMENT("Entry:");
3158 __ enter(); // required for proper stackwalking of RuntimeStub frame
3159
3160 __ kernel_crc32(crc, buf, len,
3161 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3162
3163 __ leave(); // required for proper stackwalking of RuntimeStub frame
3164 __ ret(lr);
3165
3166 return start;
3167 }
3168
3169 /**
3170 * Arguments:
3171 *
3172 * Inputs:
3173 * c_rarg0 - int crc
3174 * c_rarg1 - byte* buf
3175 * c_rarg2 - int length
3176 * c_rarg3 - int* table
3177 *
3178 * Output:
3179 * r0 - int crc result
3180 */
3181 address generate_updateBytesCRC32C() {
3182 assert(UseCRC32CIntrinsics, "what are we doing here?");
3183
3184 __ align(CodeEntryAlignment);
3185 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3186
3187 address start = __ pc();
3188
3189 const Register crc = c_rarg0; // crc
3190 const Register buf = c_rarg1; // source java byte array address
3191 const Register len = c_rarg2; // length
3192 const Register table0 = c_rarg3; // crc_table address
3193 const Register table1 = c_rarg4;
3194 const Register table2 = c_rarg5;
3195 const Register table3 = c_rarg6;
3196 const Register tmp3 = c_rarg7;
3197
3198 BLOCK_COMMENT("Entry:");
3199 __ enter(); // required for proper stackwalking of RuntimeStub frame
3200
3201 __ kernel_crc32c(crc, buf, len,
3202 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3203
3204 __ leave(); // required for proper stackwalking of RuntimeStub frame
3205 __ ret(lr);
3206
3207 return start;
3208 }
3209
3210 /***
3211 * Arguments:
3212 *
3213 * Inputs:
3214 * c_rarg0 - int adler
3215 * c_rarg1 - byte* buff
3216 * c_rarg2 - int len
3217 *
3218 * Output:
3219 * c_rarg0 - int adler result
3220 */
3221 address generate_updateBytesAdler32() {
3222 __ align(CodeEntryAlignment);
3223 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3224 address start = __ pc();
3225
3226 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3227
3228 // Aliases
3229 Register adler = c_rarg0;
3230 Register s1 = c_rarg0;
3231 Register s2 = c_rarg3;
3232 Register buff = c_rarg1;
3233 Register len = c_rarg2;
3234 Register nmax = r4;
3235 Register base = r5;
3236 Register count = r6;
3237 Register temp0 = rscratch1;
3238 Register temp1 = rscratch2;
3239 FloatRegister vbytes = v0;
3240 FloatRegister vs1acc = v1;
3241 FloatRegister vs2acc = v2;
3242 FloatRegister vtable = v3;
3243
3244 // Max number of bytes we can process before having to take the mod
3245 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3246 u_int64_t BASE = 0xfff1;
3247 u_int64_t NMAX = 0x15B0;
3248
3249 __ mov(base, BASE);
3250 __ mov(nmax, NMAX);
3251
3252 // Load accumulation coefficients for the upper 16 bits
3253 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3254 __ ld1(vtable, __ T16B, Address(temp0));
3255
3256 // s1 is initialized to the lower 16 bits of adler
3257 // s2 is initialized to the upper 16 bits of adler
3258 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3259 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3260
3261 // The pipelined loop needs at least 16 elements for 1 iteration
3262 // It does check this, but it is more effective to skip to the cleanup loop
3263 __ cmp(len, (u1)16);
3264 __ br(Assembler::HS, L_nmax);
3265 __ cbz(len, L_combine);
3266
3267 __ bind(L_simple_by1_loop);
3268 __ ldrb(temp0, Address(__ post(buff, 1)));
3269 __ add(s1, s1, temp0);
3270 __ add(s2, s2, s1);
3271 __ subs(len, len, 1);
3272 __ br(Assembler::HI, L_simple_by1_loop);
3273
3274 // s1 = s1 % BASE
3275 __ subs(temp0, s1, base);
3276 __ csel(s1, temp0, s1, Assembler::HS);
3277
3278 // s2 = s2 % BASE
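    // Folding trick: 2^16 == 15 (mod BASE = 65521), so x mod BASE can be
    // reduced as (x >> 16) * 15 + (x & 0xffff); the lsl-by-4-and-subtract
    // pair below computes the multiply by 15. C sketch of one fold (names
    // illustrative only):
    //   x = (x >> 16) * 15 + (x & 0xffff);  // leaves x < 2 * BASE here
    // after which the single conditional subtract completes the reduction.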
3279 __ lsr(temp0, s2, 16);
3280 __ lsl(temp1, temp0, 4);
3281 __ sub(temp1, temp1, temp0);
3282 __ add(s2, temp1, s2, ext::uxth);
3283
3284 __ subs(temp0, s2, base);
3285 __ csel(s2, temp0, s2, Assembler::HS);
3286
3287 __ b(L_combine);
3288
3289 __ bind(L_nmax);
3290 __ subs(len, len, nmax);
3291 __ sub(count, nmax, 16);
3292 __ br(Assembler::LO, L_by16);
3293
3294 __ bind(L_nmax_loop);
3295
3296 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3297 vbytes, vs1acc, vs2acc, vtable);
3298
3299 __ subs(count, count, 16);
3300 __ br(Assembler::HS, L_nmax_loop);
3301
3302 // s1 = s1 % BASE
3303 __ lsr(temp0, s1, 16);
3304 __ lsl(temp1, temp0, 4);
3305 __ sub(temp1, temp1, temp0);
3306 __ add(temp1, temp1, s1, ext::uxth);
3307
3308 __ lsr(temp0, temp1, 16);
3309 __ lsl(s1, temp0, 4);
3310 __ sub(s1, s1, temp0);
3311     __ add(s1, s1, temp1, ext::uxth);
3312
3313 __ subs(temp0, s1, base);
3314 __ csel(s1, temp0, s1, Assembler::HS);
3315
3316 // s2 = s2 % BASE
3317 __ lsr(temp0, s2, 16);
3318 __ lsl(temp1, temp0, 4);
3319 __ sub(temp1, temp1, temp0);
3320 __ add(temp1, temp1, s2, ext::uxth);
3321
3322 __ lsr(temp0, temp1, 16);
3323 __ lsl(s2, temp0, 4);
3324 __ sub(s2, s2, temp0);
3325     __ add(s2, s2, temp1, ext::uxth);
3326
3327 __ subs(temp0, s2, base);
3328 __ csel(s2, temp0, s2, Assembler::HS);
3329
3330 __ subs(len, len, nmax);
3331 __ sub(count, nmax, 16);
3332 __ br(Assembler::HS, L_nmax_loop);
3333
3334 __ bind(L_by16);
3335 __ adds(len, len, count);
3336 __ br(Assembler::LO, L_by1);
3337
3338 __ bind(L_by16_loop);
3339
3340 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3341 vbytes, vs1acc, vs2acc, vtable);
3342
3343 __ subs(len, len, 16);
3344 __ br(Assembler::HS, L_by16_loop);
3345
3346 __ bind(L_by1);
3347 __ adds(len, len, 15);
3348 __ br(Assembler::LO, L_do_mod);
3349
3350 __ bind(L_by1_loop);
3351 __ ldrb(temp0, Address(__ post(buff, 1)));
3352 __ add(s1, temp0, s1);
3353 __ add(s2, s2, s1);
3354 __ subs(len, len, 1);
3355 __ br(Assembler::HS, L_by1_loop);
3356
3357 __ bind(L_do_mod);
3358 // s1 = s1 % BASE
3359 __ lsr(temp0, s1, 16);
3360 __ lsl(temp1, temp0, 4);
3361 __ sub(temp1, temp1, temp0);
3362 __ add(temp1, temp1, s1, ext::uxth);
3363
3364 __ lsr(temp0, temp1, 16);
3365 __ lsl(s1, temp0, 4);
3366 __ sub(s1, s1, temp0);
3367     __ add(s1, s1, temp1, ext::uxth);
3368
3369 __ subs(temp0, s1, base);
3370 __ csel(s1, temp0, s1, Assembler::HS);
3371
3372 // s2 = s2 % BASE
3373 __ lsr(temp0, s2, 16);
3374 __ lsl(temp1, temp0, 4);
3375 __ sub(temp1, temp1, temp0);
3376 __ add(temp1, temp1, s2, ext::uxth);
3377
3378 __ lsr(temp0, temp1, 16);
3379 __ lsl(s2, temp0, 4);
3380 __ sub(s2, s2, temp0);
3381     __ add(s2, s2, temp1, ext::uxth);
3382
3383 __ subs(temp0, s2, base);
3384 __ csel(s2, temp0, s2, Assembler::HS);
3385
3386 // Combine lower bits and higher bits
3387 __ bind(L_combine);
3388 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3389
3390 __ ret(lr);
3391
3392 return start;
3393 }
3394
3395   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3396 Register temp0, Register temp1, FloatRegister vbytes,
3397 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3398 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3399 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3400 // In non-vectorized code, we update s1 and s2 as:
3401 // s1 <- s1 + b1
3402 // s2 <- s2 + s1
3403 // s1 <- s1 + b2
3404     // s2 <- s2 + s1
3405 // ...
3406 // s1 <- s1 + b16
3407 // s2 <- s2 + s1
3408 // Putting above assignments together, we have:
3409 // s1_new = s1 + b1 + b2 + ... + b16
3410 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3411 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3412 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3413 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3414
3415 // s2 = s2 + s1 * 16
3416 __ add(s2, s2, s1, Assembler::LSL, 4);
3417
3418 // vs1acc = b1 + b2 + b3 + ... + b16
3419 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
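    // vtable holds the coefficients (16, 15, ..., 1): umullv multiplies the
    // low eight byte lanes by their coefficients, umlalv (the upper-half
    // form) accumulates the high eight, and the two uaddlv instructions
    // reduce the lanes to scalar sums.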
3420 __ umullv(vs2acc, __ T8B, vtable, vbytes);
3421 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3422 __ uaddlv(vs1acc, __ T16B, vbytes);
3423 __ uaddlv(vs2acc, __ T8H, vs2acc);
3424
3425 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3426 __ fmovd(temp0, vs1acc);
3427 __ fmovd(temp1, vs2acc);
3428 __ add(s1, s1, temp0);
3429 __ add(s2, s2, temp1);
3430 }
3431
3432 /**
3433 * Arguments:
3434 *
3435 * Input:
3436 * c_rarg0 - x address
3437 * c_rarg1 - x length
3438 * c_rarg2 - y address
3439    * c_rarg3 - y length
3440 * c_rarg4 - z address
3441 * c_rarg5 - z length
3442 */
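  // This is the intrinsic for java.math.BigInteger::multiplyToLen; the
  // multiplication itself is implemented in MacroAssembler::multiply_to_len.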
3443   address generate_multiplyToLen() {
3444 __ align(CodeEntryAlignment);
3445 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3446
3447 address start = __ pc();
3448 const Register x = r0;
3449 const Register xlen = r1;
3450 const Register y = r2;
3451 const Register ylen = r3;
3452 const Register z = r4;
3453 const Register zlen = r5;
3454
3455 const Register tmp1 = r10;
3456 const Register tmp2 = r11;
3457 const Register tmp3 = r12;
3458 const Register tmp4 = r13;
3459 const Register tmp5 = r14;
3460 const Register tmp6 = r15;
3461 const Register tmp7 = r16;
3462
3463 BLOCK_COMMENT("Entry:");
3464 __ enter(); // required for proper stackwalking of RuntimeStub frame
3465 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3466 __ leave(); // required for proper stackwalking of RuntimeStub frame
3467 __ ret(lr);
3468
3469 return start;
3470 }
3471
3472   address generate_squareToLen() {
3473     // The squareToLen algorithm for sizes 1..127, described in Java code, runs
3474     // faster than multiply_to_len on some CPUs and slower on others, but
3475     // multiply_to_len shows slightly better overall results
3476 __ align(CodeEntryAlignment);
3477 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3478 address start = __ pc();
3479
3480 const Register x = r0;
3481 const Register xlen = r1;
3482 const Register z = r2;
3483 const Register zlen = r3;
3484 const Register y = r4; // == x
3485 const Register ylen = r5; // == xlen
3486
3487 const Register tmp1 = r10;
3488 const Register tmp2 = r11;
3489 const Register tmp3 = r12;
3490 const Register tmp4 = r13;
3491 const Register tmp5 = r14;
3492 const Register tmp6 = r15;
3493 const Register tmp7 = r16;
3494
3495 RegSet spilled_regs = RegSet::of(y, ylen);
3496 BLOCK_COMMENT("Entry:");
3497 __ enter();
3498 __ push(spilled_regs, sp);
3499 __ mov(y, x);
3500 __ mov(ylen, xlen);
3501 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3502 __ pop(spilled_regs, sp);
3503 __ leave();
3504 __ ret(lr);
3505 return start;
3506 }
3507
3508   address generate_mulAdd() {
3509 __ align(CodeEntryAlignment);
3510 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3511
3512 address start = __ pc();
3513
3514 const Register out = r0;
3515 const Register in = r1;
3516 const Register offset = r2;
3517 const Register len = r3;
3518 const Register k = r4;
3519
3520 BLOCK_COMMENT("Entry:");
3521 __ enter();
3522 __ mul_add(out, in, offset, len, k);
3523 __ leave();
3524 __ ret(lr);
3525
3526 return start;
3527 }
3528
3529   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3530 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3531 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3532 // Karatsuba multiplication performs a 128*128 -> 256-bit
3533 // multiplication in three 128-bit multiplications and a few
3534 // additions.
3535 //
3536 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3537 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3538 //
3539 // Inputs:
3540 //
3541 // A0 in a.d[0] (subkey)
3542 // A1 in a.d[1]
3543 // (A1+A0) in a1_xor_a0.d[0]
3544 //
3545 // B0 in b.d[0] (state)
3546 // B1 in b.d[1]
3547
3548 __ ext(tmp1, __ T16B, b, b, 0x08);
3549 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3550 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3551 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3552 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3553
3554 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3555 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3556 __ eor(tmp2, __ T16B, tmp2, tmp4);
3557 __ eor(tmp2, __ T16B, tmp2, tmp3);
3558
3559 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3560 __ ins(result_hi, __ D, tmp2, 0, 1);
3561 __ ins(result_lo, __ D, tmp2, 1, 0);
3562 }
3563
3564   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3565 FloatRegister p, FloatRegister z, FloatRegister t1) {
3566 const FloatRegister t0 = result;
3567
3568 // The GCM field polynomial f is z^128 + p(z), where p =
3569 // z^7+z^2+z+1.
3570 //
3571 // z^128 === -p(z) (mod (z^128 + p(z)))
3572 //
3573 // so, given that the product we're reducing is
3574 // a == lo + hi * z^128
3575 // substituting,
3576 // === lo - hi * p(z) (mod (z^128 + p(z)))
3577 //
3578 // we reduce by multiplying hi by p(z) and subtracting the result
3579 // from (i.e. XORing it with) lo. Because p has no nonzero high
3580 // bits we can do this with two 64-bit multiplications, lo*p and
3581 // hi*p.
3582
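    // Fold hi into lo 64 bits at a time: pmull2 multiplies the top 64 bits
    // of hi by p and the two ext/eor pairs xor that (shifted) product back
    // into <hi:lo>; pmull then multiplies the remaining low 64 bits of hi
    // by p and xors the result into lo.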
3583 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3584 __ ext(t1, __ T16B, t0, z, 8);
3585 __ eor(hi, __ T16B, hi, t1);
3586 __ ext(t1, __ T16B, z, t0, 8);
3587 __ eor(lo, __ T16B, lo, t1);
3588 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3589 __ eor(result, __ T16B, lo, t0);
3590 }
3591
3592   address generate_has_negatives(address &has_negatives_long) {
3593 const u1 large_loop_size = 64;
3594     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
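    // A Java byte is negative iff its most-significant bit is set, so OR-ing
    // loaded words together and testing against this mask detects any
    // negative byte in the batch.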
3595 int dcache_line = VM_Version::dcache_line_size();
3596
3597 Register ary1 = r1, len = r2, result = r0;
3598
3599 __ align(CodeEntryAlignment);
3600
3601 StubCodeMark mark(this, "StubRoutines", "has_negatives");
3602
3603 address entry = __ pc();
3604
3605 __ enter();
3606
3607 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3608 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3609
3610 __ cmp(len, (u1)15);
3611 __ br(Assembler::GT, LEN_OVER_15);
3612     // The only case in which execution falls into this code is when the pointer
3613     // is near the end of a memory page and we have to avoid reading the next page
3614 __ add(ary1, ary1, len);
3615 __ subs(len, len, 8);
3616 __ br(Assembler::GT, LEN_OVER_8);
3617 __ ldr(rscratch2, Address(ary1, -8));
3618 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
3619 __ lsrv(rscratch2, rscratch2, rscratch1);
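    // The load above may read up to 7 bytes that precede the array; this is
    // safe because they are on the same page. The variable right shift then
    // discards those lower-address bytes (little-endian: lowest address is
    // least significant), so only the array's own bytes reach the test below.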
3620 __ tst(rscratch2, UPPER_BIT_MASK);
3621 __ cset(result, Assembler::NE);
3622 __ leave();
3623 __ ret(lr);
3624 __ bind(LEN_OVER_8);
3625 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3626 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3627 __ tst(rscratch2, UPPER_BIT_MASK);
3628 __ br(Assembler::NE, RET_TRUE_NO_POP);
3629 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3630 __ lsrv(rscratch1, rscratch1, rscratch2);
3631 __ tst(rscratch1, UPPER_BIT_MASK);
3632 __ cset(result, Assembler::NE);
3633 __ leave();
3634 __ ret(lr);
3635
3636 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3637 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3638
3639 has_negatives_long = __ pc(); // 2nd entry point
3640
3641 __ enter();
3642
3643 __ bind(LEN_OVER_15);
3644 __ push(spilled_regs, sp);
3645 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3646 __ cbz(rscratch2, ALIGNED);
3647 __ ldp(tmp6, tmp1, Address(ary1));
3648 __ mov(tmp5, 16);
3649 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3650 __ add(ary1, ary1, rscratch1);
3651 __ sub(len, len, rscratch1);
3652 __ orr(tmp6, tmp6, tmp1);
3653 __ tst(tmp6, UPPER_BIT_MASK);
3654 __ br(Assembler::NE, RET_TRUE);
3655
3656 __ bind(ALIGNED);
3657 __ cmp(len, large_loop_size);
3658 __ br(Assembler::LT, CHECK_16);
3659     // Perform a 16-byte load as an early return in the pre-loop to handle the
3660     // situation when an initially aligned large array has negative values at its
3661     // starting bytes, where LARGE_LOOP would do 4 reads instead of 1 (in the
3662     // worst case), which is slower. Cases with negative bytes further ahead
3663     // won't be affected much; in fact they'll be faster due to the early loads
3664     // and the fewer instructions and branches in LARGE_LOOP.
3665 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3666 __ sub(len, len, 16);
3667 __ orr(tmp6, tmp6, tmp1);
3668 __ tst(tmp6, UPPER_BIT_MASK);
3669 __ br(Assembler::NE, RET_TRUE);
3670 __ cmp(len, large_loop_size);
3671 __ br(Assembler::LT, CHECK_16);
3672
3673 if (SoftwarePrefetchHintDistance >= 0
3674 && SoftwarePrefetchHintDistance >= dcache_line) {
3675 // initial prefetch
3676 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3677 }
3678 __ bind(LARGE_LOOP);
3679 if (SoftwarePrefetchHintDistance >= 0) {
3680 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3681 }
3682     // Issue the load instructions first, since that can save a few CPU/memory
3683     // cycles; also, instead of 4 triples of "orr(...); tst(...); br(...)" (one
3684     // for each ldp), generate 7 * orr(...) + 1 tst(...) + 1 br(...), which saves
3685     // instructions and has fewer branches. The trade-off is that early return is
3686     // disabled, so all 64 bytes are loaded and checked every time.
3687 __ ldp(tmp2, tmp3, Address(ary1));
3688 __ ldp(tmp4, tmp5, Address(ary1, 16));
3689 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3690 __ ldp(tmp6, tmp1, Address(ary1, 48));
3691 __ add(ary1, ary1, large_loop_size);
3692 __ sub(len, len, large_loop_size);
3693 __ orr(tmp2, tmp2, tmp3);
3694 __ orr(tmp4, tmp4, tmp5);
3695 __ orr(rscratch1, rscratch1, rscratch2);
3696 __ orr(tmp6, tmp6, tmp1);
3697 __ orr(tmp2, tmp2, tmp4);
3698 __ orr(rscratch1, rscratch1, tmp6);
3699 __ orr(tmp2, tmp2, rscratch1);
3700 __ tst(tmp2, UPPER_BIT_MASK);
3701 __ br(Assembler::NE, RET_TRUE);
3702 __ cmp(len, large_loop_size);
3703 __ br(Assembler::GE, LARGE_LOOP);
3704
3705 __ bind(CHECK_16); // small 16-byte load pre-loop
3706 __ cmp(len, (u1)16);
3707 __ br(Assembler::LT, POST_LOOP16);
3708
3709 __ bind(LOOP16); // small 16-byte load loop
3710 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3711 __ sub(len, len, 16);
3712 __ orr(tmp2, tmp2, tmp3);
3713 __ tst(tmp2, UPPER_BIT_MASK);
3714 __ br(Assembler::NE, RET_TRUE);
3715 __ cmp(len, (u1)16);
3716 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3717
3718 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3719 __ cmp(len, (u1)8);
3720 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3721 __ ldr(tmp3, Address(__ post(ary1, 8)));
3722 __ sub(len, len, 8);
3723 __ tst(tmp3, UPPER_BIT_MASK);
3724 __ br(Assembler::NE, RET_TRUE);
3725
3726 __ bind(POST_LOOP16_LOAD_TAIL);
3727 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3728 __ ldr(tmp1, Address(ary1));
3729 __ mov(tmp2, 64);
3730 __ sub(tmp4, tmp2, len, __ LSL, 3);
3731 __ lslv(tmp1, tmp1, tmp4);
3732 __ tst(tmp1, UPPER_BIT_MASK);
3733 __ br(Assembler::NE, RET_TRUE);
3734 // Fallthrough
3735
3736 __ bind(RET_FALSE);
3737 __ pop(spilled_regs, sp);
3738 __ leave();
3739 __ mov(result, zr);
3740 __ ret(lr);
3741
3742 __ bind(RET_TRUE);
3743 __ pop(spilled_regs, sp);
3744 __ bind(RET_TRUE_NO_POP);
3745 __ leave();
3746 __ mov(result, 1);
3747 __ ret(lr);
3748
3749 __ bind(DONE);
3750 __ pop(spilled_regs, sp);
3751 __ leave();
3752 __ ret(lr);
3753 return entry;
3754 }
3755
3756   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3757 bool usePrefetch, Label &NOT_EQUAL) {
3758 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3759 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3760 tmp7 = r12, tmp8 = r13;
3761 Label LOOP;
3762
3763 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3764 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3765 __ bind(LOOP);
3766 if (usePrefetch) {
3767 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3768 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3769 }
3770 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3771 __ eor(tmp1, tmp1, tmp2);
3772 __ eor(tmp3, tmp3, tmp4);
3773 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3774 __ orr(tmp1, tmp1, tmp3);
3775 __ cbnz(tmp1, NOT_EQUAL);
3776 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3777 __ eor(tmp5, tmp5, tmp6);
3778 __ eor(tmp7, tmp7, tmp8);
3779 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3780 __ orr(tmp5, tmp5, tmp7);
3781 __ cbnz(tmp5, NOT_EQUAL);
3782 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3783 __ eor(tmp1, tmp1, tmp2);
3784 __ eor(tmp3, tmp3, tmp4);
3785 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3786 __ orr(tmp1, tmp1, tmp3);
3787 __ cbnz(tmp1, NOT_EQUAL);
3788 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3789 __ eor(tmp5, tmp5, tmp6);
3790 __ sub(cnt1, cnt1, 8 * wordSize);
3791 __ eor(tmp7, tmp7, tmp8);
3792 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3793     // tmp6 is not used. MacroAssembler::subs is used here (rather than cmp)
3794     // because subs allows an unlimited range of immediate operands.
3795 __ subs(tmp6, cnt1, loopThreshold);
3796 __ orr(tmp5, tmp5, tmp7);
3797 __ cbnz(tmp5, NOT_EQUAL);
3798 __ br(__ GE, LOOP);
3799 // post-loop
3800 __ eor(tmp1, tmp1, tmp2);
3801 __ eor(tmp3, tmp3, tmp4);
3802 __ orr(tmp1, tmp1, tmp3);
3803 __ sub(cnt1, cnt1, 2 * wordSize);
3804 __ cbnz(tmp1, NOT_EQUAL);
3805 }
3806
3807   void generate_large_array_equals_loop_simd(int loopThreshold,
3808 bool usePrefetch, Label &NOT_EQUAL) {
3809 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3810 tmp2 = rscratch2;
3811 Label LOOP;
3812
3813 __ bind(LOOP);
3814 if (usePrefetch) {
3815 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3816 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3817 }
3818 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3819 __ sub(cnt1, cnt1, 8 * wordSize);
3820 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3821 __ subs(tmp1, cnt1, loopThreshold);
3822 __ eor(v0, __ T16B, v0, v4);
3823 __ eor(v1, __ T16B, v1, v5);
3824 __ eor(v2, __ T16B, v2, v6);
3825 __ eor(v3, __ T16B, v3, v7);
3826 __ orr(v0, __ T16B, v0, v1);
3827 __ orr(v1, __ T16B, v2, v3);
3828 __ orr(v0, __ T16B, v0, v1);
3829 __ umov(tmp1, v0, __ D, 0);
3830 __ umov(tmp2, v0, __ D, 1);
3831 __ orr(tmp1, tmp1, tmp2);
3832 __ cbnz(tmp1, NOT_EQUAL);
3833 __ br(__ GE, LOOP);
3834 }
3835
3836 // a1 = r1 - array1 address
3837 // a2 = r2 - array2 address
3838 // result = r0 - return value. Already contains "false"
3839 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3840 // r3-r5 are reserved temporary registers
3841   address generate_large_array_equals() {
3842 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3843 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3844 tmp7 = r12, tmp8 = r13;
3845 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3846 SMALL_LOOP, POST_LOOP;
3847 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3848     // thresholds chosen so that at least 32 prefetched bytes are actually used
3849 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3850 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3851 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3852 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3853 tmp5, tmp6, tmp7, tmp8);
3854
3855 __ align(CodeEntryAlignment);
3856
3857 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3858
3859 address entry = __ pc();
3860 __ enter();
3861 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3862 // also advance pointers to use post-increment instead of pre-increment
3863 __ add(a1, a1, wordSize);
3864 __ add(a2, a2, wordSize);
3865 if (AvoidUnalignedAccesses) {
3866       // Both implementations (SIMD/non-SIMD) use relatively large load
3867       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3868       // time) on some CPUs when the address is not at least 16-byte aligned.
3869       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
3870       // load, if needed, for at least the 1st address to make it 16-byte aligned.
3871 Label ALIGNED16;
3872 __ tbz(a1, 3, ALIGNED16);
3873 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3874 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3875 __ sub(cnt1, cnt1, wordSize);
3876 __ eor(tmp1, tmp1, tmp2);
3877 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3878 __ bind(ALIGNED16);
3879 }
3880 if (UseSIMDForArrayEquals) {
3881 if (SoftwarePrefetchHintDistance >= 0) {
3882 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3883 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3884 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3885 /* prfm = */ true, NOT_EQUAL);
3886 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3887 __ br(__ LT, TAIL);
3888 }
3889 __ bind(NO_PREFETCH_LARGE_LOOP);
3890 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3891 /* prfm = */ false, NOT_EQUAL);
3892 } else {
3893 __ push(spilled_regs, sp);
3894 if (SoftwarePrefetchHintDistance >= 0) {
3895 __ subs(tmp1, cnt1, prefetchLoopThreshold);
3896 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3897 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3898 /* prfm = */ true, NOT_EQUAL);
3899 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3900 __ br(__ LT, TAIL);
3901 }
3902 __ bind(NO_PREFETCH_LARGE_LOOP);
3903 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3904 /* prfm = */ false, NOT_EQUAL);
3905 }
3906 __ bind(TAIL);
3907 __ cbz(cnt1, EQUAL);
3908 __ subs(cnt1, cnt1, wordSize);
3909 __ br(__ LE, POST_LOOP);
3910 __ bind(SMALL_LOOP);
3911 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3912 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3913 __ subs(cnt1, cnt1, wordSize);
3914 __ eor(tmp1, tmp1, tmp2);
3915 __ cbnz(tmp1, NOT_EQUAL);
3916 __ br(__ GT, SMALL_LOOP);
3917 __ bind(POST_LOOP);
3918 __ ldr(tmp1, Address(a1, cnt1));
3919 __ ldr(tmp2, Address(a2, cnt1));
3920 __ eor(tmp1, tmp1, tmp2);
3921 __ cbnz(tmp1, NOT_EQUAL);
3922 __ bind(EQUAL);
3923 __ mov(result, true);
3924 __ bind(NOT_EQUAL);
3925 if (!UseSIMDForArrayEquals) {
3926 __ pop(spilled_regs, sp);
3927 }
3928 __ bind(NOT_EQUAL_NO_POP);
3929 __ leave();
3930 __ ret(lr);
3931 return entry;
3932 }
3933
3934   address generate_dsin_dcos(bool isCos) {
3935 __ align(CodeEntryAlignment);
3936 StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3937 address start = __ pc();
3938 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3939 (address)StubRoutines::aarch64::_two_over_pi,
3940 (address)StubRoutines::aarch64::_pio2,
3941 (address)StubRoutines::aarch64::_dsin_coef,
3942 (address)StubRoutines::aarch64::_dcos_coef);
3943 return start;
3944 }
3945
3946   address generate_dlog() {
3947 __ align(CodeEntryAlignment);
3948 StubCodeMark mark(this, "StubRoutines", "dlog");
3949 address entry = __ pc();
3950 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3951 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3952 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3953 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3954 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3955 return entry;
3956 }
3957
3958 // code for comparing 16 bytes of strings with same encoding
3959   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3960 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3961 __ ldr(rscratch1, Address(__ post(str1, 8)));
3962 __ eor(rscratch2, tmp1, tmp2);
3963 __ ldr(cnt1, Address(__ post(str2, 8)));
3964 __ cbnz(rscratch2, DIFF1);
3965 __ ldr(tmp1, Address(__ post(str1, 8)));
3966 __ eor(rscratch2, rscratch1, cnt1);
3967 __ ldr(tmp2, Address(__ post(str2, 8)));
3968 __ cbnz(rscratch2, DIFF2);
3969 }
3970
3971 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3972   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3973 Label &DIFF2) {
3974 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3975 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3976
3977 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3978 __ ldr(tmpU, Address(__ post(cnt1, 8)));
3979 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
3980 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
3981
3982 __ fmovd(tmpL, vtmp3);
3983 __ eor(rscratch2, tmp3, tmpL);
3984 __ cbnz(rscratch2, DIFF2);
3985
3986 __ ldr(tmp3, Address(__ post(cnt1, 8)));
3987 __ umov(tmpL, vtmp3, __ D, 1);
3988 __ eor(rscratch2, tmpU, tmpL);
3989 __ cbnz(rscratch2, DIFF1);
3990
3991 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
3992 __ ldr(tmpU, Address(__ post(cnt1, 8)));
3993 __ fmovd(tmpL, vtmp);
3994 __ eor(rscratch2, tmp3, tmpL);
3995 __ cbnz(rscratch2, DIFF2);
3996
3997 __ ldr(tmp3, Address(__ post(cnt1, 8)));
3998 __ umov(tmpL, vtmp, __ D, 1);
3999 __ eor(rscratch2, tmpU, tmpL);
4000 __ cbnz(rscratch2, DIFF1);
4001 }
4002
4003 // r0 = result
4004 // r1 = str1
4005 // r2 = cnt1
4006 // r3 = str2
4007 // r4 = cnt2
4008 // r10 = tmp1
4009 // r11 = tmp2
4010   address generate_compare_long_string_different_encoding(bool isLU) {
4011 __ align(CodeEntryAlignment);
4012 StubCodeMark mark(this, "StubRoutines", isLU
4013 ? "compare_long_string_different_encoding LU"
4014 : "compare_long_string_different_encoding UL");
4015 address entry = __ pc();
4016 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4017 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4018 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4019 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4020 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4021 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4022 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4023
4024 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4025
4026 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4027 // cnt2 == amount of characters left to compare
4028     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4029 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4030 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4031 __ add(str2, str2, isLU ? wordSize : wordSize/2);
4032 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4033 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4034 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4035 __ eor(rscratch2, tmp1, tmp2);
4036 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4037 __ mov(rscratch1, tmp2);
4038 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4039 Register strU = isLU ? str2 : str1,
4040 strL = isLU ? str1 : str2,
4041 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4042 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4043 __ push(spilled_regs, sp);
4044 __ sub(tmp2, strL, cnt2); // strL pointer to load from
4045 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4046
4047 __ ldr(tmp3, Address(__ post(cnt1, 8)));
4048
4049 if (SoftwarePrefetchHintDistance >= 0) {
4050 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4051 __ br(__ LT, NO_PREFETCH);
4052 __ bind(LARGE_LOOP_PREFETCH);
4053 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4054 __ mov(tmp4, 2);
4055 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4056 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4057 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4058 __ subs(tmp4, tmp4, 1);
4059 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4060 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4061 __ mov(tmp4, 2);
4062 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4063 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4064 __ subs(tmp4, tmp4, 1);
4065 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4066 __ sub(cnt2, cnt2, 64);
4067 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4068 __ br(__ GE, LARGE_LOOP_PREFETCH);
4069 }
4070 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4071 __ bind(NO_PREFETCH);
4072 __ subs(cnt2, cnt2, 16);
4073 __ br(__ LT, TAIL);
4074 __ bind(SMALL_LOOP); // smaller loop
4075 __ subs(cnt2, cnt2, 16);
4076 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4077 __ br(__ GE, SMALL_LOOP);
4078 __ cmn(cnt2, (u1)16);
4079 __ br(__ EQ, LOAD_LAST);
4080 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4081 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4082 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4083 __ ldr(tmp3, Address(cnt1, -8));
4084 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4085 __ b(LOAD_LAST);
4086 __ bind(DIFF2);
4087 __ mov(tmpU, tmp3);
4088 __ bind(DIFF1);
4089 __ pop(spilled_regs, sp);
4090 __ b(CALCULATE_DIFFERENCE);
4091 __ bind(LOAD_LAST);
4092     // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4093     // No need to load them again.
4094 __ mov(tmpU, tmp3);
4095 __ pop(spilled_regs, sp);
4096
4097 __ ldrs(vtmp, Address(strL));
4098 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4099 __ fmovd(tmpL, vtmp);
4100
4101 __ eor(rscratch2, tmpU, tmpL);
4102 __ cbz(rscratch2, DONE);
4103
4104 // Find the first different characters in the longwords and
4105 // compute their difference.
4106 __ bind(CALCULATE_DIFFERENCE);
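    // rev byte-reverses the xor of the two words, so clz yields the bit
    // offset of the first differing byte in memory order; andr(..., -16)
    // rounds that down to the start of the containing 16-bit character, and
    // the shifts below extract the two characters to subtract.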
4107 __ rev(rscratch2, rscratch2);
4108 __ clz(rscratch2, rscratch2);
4109 __ andr(rscratch2, rscratch2, -16);
4110 __ lsrv(tmp1, tmp1, rscratch2);
4111 __ uxthw(tmp1, tmp1);
4112 __ lsrv(rscratch1, rscratch1, rscratch2);
4113 __ uxthw(rscratch1, rscratch1);
4114 __ subw(result, tmp1, rscratch1);
4115 __ bind(DONE);
4116 __ ret(lr);
4117 return entry;
4118 }
4119
4120 // r0 = result
4121 // r1 = str1
4122 // r2 = cnt1
4123 // r3 = str2
4124 // r4 = cnt2
4125 // r10 = tmp1
4126 // r11 = tmp2
4127   address generate_compare_long_string_same_encoding(bool isLL) {
4128 __ align(CodeEntryAlignment);
4129 StubCodeMark mark(this, "StubRoutines", isLL
4130 ? "compare_long_string_same_encoding LL"
4131 : "compare_long_string_same_encoding UU");
4132 address entry = __ pc();
4133 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4134 tmp1 = r10, tmp2 = r11;
4135 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4136 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4137 DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4138     // exit from the large loop when less than 64 bytes are left to read or we're
4139     // about to prefetch memory beyond the array border
4140 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4141 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
4142 // update cnt2 counter with already loaded 8 bytes
4143 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4144 // update pointers, because of previous read
4145 __ add(str1, str1, wordSize);
4146 __ add(str2, str2, wordSize);
4147 if (SoftwarePrefetchHintDistance >= 0) {
4148 __ bind(LARGE_LOOP_PREFETCH);
4149 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4150 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4151 compare_string_16_bytes_same(DIFF, DIFF2);
4152 compare_string_16_bytes_same(DIFF, DIFF2);
4153 __ sub(cnt2, cnt2, isLL ? 64 : 32);
4154 compare_string_16_bytes_same(DIFF, DIFF2);
4155 __ subs(rscratch2, cnt2, largeLoopExitCondition);
4156 compare_string_16_bytes_same(DIFF, DIFF2);
4157 __ br(__ GT, LARGE_LOOP_PREFETCH);
4158 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4159 }
4160 // less than 16 bytes left?
4161 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4162 __ br(__ LT, TAIL);
4163 __ bind(SMALL_LOOP);
4164 compare_string_16_bytes_same(DIFF, DIFF2);
4165 __ subs(cnt2, cnt2, isLL ? 16 : 8);
4166 __ br(__ GE, SMALL_LOOP);
4167 __ bind(TAIL);
4168 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4169 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4170 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4171 __ br(__ LE, CHECK_LAST);
4172 __ eor(rscratch2, tmp1, tmp2);
4173 __ cbnz(rscratch2, DIFF);
4174 __ ldr(tmp1, Address(__ post(str1, 8)));
4175 __ ldr(tmp2, Address(__ post(str2, 8)));
4176 __ sub(cnt2, cnt2, isLL ? 8 : 4);
4177 __ bind(CHECK_LAST);
4178 if (!isLL) {
4179 __ add(cnt2, cnt2, cnt2); // now in bytes
4180 }
4181 __ eor(rscratch2, tmp1, tmp2);
4182 __ cbnz(rscratch2, DIFF);
4183 __ ldr(rscratch1, Address(str1, cnt2));
4184 __ ldr(cnt1, Address(str2, cnt2));
4185 __ eor(rscratch2, rscratch1, cnt1);
4186 __ cbz(rscratch2, LENGTH_DIFF);
4187 // Find the first different characters in the longwords and
4188 // compute their difference.
4189 __ bind(DIFF2);
4190 __ rev(rscratch2, rscratch2);
4191 __ clz(rscratch2, rscratch2);
4192 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4193 __ lsrv(rscratch1, rscratch1, rscratch2);
4194 if (isLL) {
4195 __ lsrv(cnt1, cnt1, rscratch2);
4196 __ uxtbw(rscratch1, rscratch1);
4197 __ uxtbw(cnt1, cnt1);
4198 } else {
4199 __ lsrv(cnt1, cnt1, rscratch2);
4200 __ uxthw(rscratch1, rscratch1);
4201 __ uxthw(cnt1, cnt1);
4202 }
4203 __ subw(result, rscratch1, cnt1);
4204 __ b(LENGTH_DIFF);
4205 __ bind(DIFF);
4206 __ rev(rscratch2, rscratch2);
4207 __ clz(rscratch2, rscratch2);
4208 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4209 __ lsrv(tmp1, tmp1, rscratch2);
4210 if (isLL) {
4211 __ lsrv(tmp2, tmp2, rscratch2);
4212 __ uxtbw(tmp1, tmp1);
4213 __ uxtbw(tmp2, tmp2);
4214 } else {
4215 __ lsrv(tmp2, tmp2, rscratch2);
4216 __ uxthw(tmp1, tmp1);
4217 __ uxthw(tmp2, tmp2);
4218 }
4219 __ subw(result, tmp1, tmp2);
4220 __ b(LENGTH_DIFF);
4221 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4222 __ eor(rscratch2, tmp1, tmp2);
4223 __ cbnz(rscratch2, DIFF);
4224 __ bind(LENGTH_DIFF);
4225 __ ret(lr);
4226 return entry;
4227 }
4228
4229   void generate_compare_long_strings() {
4230 StubRoutines::aarch64::_compare_long_string_LL
4231 = generate_compare_long_string_same_encoding(true);
4232 StubRoutines::aarch64::_compare_long_string_UU
4233 = generate_compare_long_string_same_encoding(false);
4234 StubRoutines::aarch64::_compare_long_string_LU
4235 = generate_compare_long_string_different_encoding(true);
4236 StubRoutines::aarch64::_compare_long_string_UL
4237 = generate_compare_long_string_different_encoding(false);
4238 }
4239
4240 // R0 = result
4241 // R1 = str2
4242 // R2 = cnt1
4243 // R3 = str1
4244 // R4 = cnt2
4245   // This generic linear code uses a few additional ideas that make it faster:
4246   // 1) we can safely keep at least the 1st register of the pattern (since
4247   //    length >= 8) in order to skip its initial load (this helps on systems
4248   //    with a single load pipeline)
4249   // 2) we can use the "fast" algorithm for finding the first character, which
4250   //    searches for the first symbol with fewer branches (1 branch per loaded
4251   //    register instead of a branch per symbol); this is where constants like
4252   //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4253   // 3) after loading and analyzing the 1st register of the source string, it
4254   //    can be used to search for every occurrence of the 1st character, saving
4255   //    a few loads compared with a "simpler-but-slower" implementation
4256   // 4) in order to avoid lots of push/pop operations, the code below heavily
4257   //    re-uses/re-initializes/compresses register values, which makes the code
4258   //    larger and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
4259   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4260 const char* stubName = str1_isL
4261 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4262 : "indexof_linear_uu";
4263 __ align(CodeEntryAlignment);
4264 StubCodeMark mark(this, "StubRoutines", stubName);
4265 address entry = __ pc();
4266
4267 int str1_chr_size = str1_isL ? 1 : 2;
4268 int str2_chr_size = str2_isL ? 1 : 2;
4269 int str1_chr_shift = str1_isL ? 0 : 1;
4270 int str2_chr_shift = str2_isL ? 0 : 1;
4271 bool isL = str1_isL && str2_isL;
4272 // parameters
4273 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4274 // temporary registers
4275 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4276 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4277 // redefinitions
4278 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4279
4280 __ push(spilled_regs, sp);
4281 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4282 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4283 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4284 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4285 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4286 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4287 // Read whole register from str1. It is safe, because length >=8 here
4288 __ ldr(ch1, Address(str1));
4289 // Read whole register from str2. It is safe, because length >=8 here
4290 __ ldr(ch2, Address(str2));
4291 __ sub(cnt2, cnt2, cnt1);
4292 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4293 if (str1_isL != str2_isL) {
4294 __ eor(v0, __ T16B, v0, v0);
4295 }
4296 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4297 __ mul(first, first, tmp1);
4298 // check if we have less than 1 register to check
4299 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4300 if (str1_isL != str2_isL) {
4301 __ fmovd(v1, ch1);
4302 }
4303 __ br(__ LE, L_SMALL);
4304 __ eor(ch2, first, ch2);
4305 if (str1_isL != str2_isL) {
4306 __ zip1(v1, __ T16B, v1, v0);
4307 }
4308 __ sub(tmp2, ch2, tmp1);
4309 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4310 __ bics(tmp2, tmp2, ch2);
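    // SWAR zero-detect: after the eor above, a lane of ch2 is zero exactly
    // where str2 matches the replicated first pattern character. With x
    // denoting that xor-ed word, the sub/orr/bics sequence computes
    //   (x - 0x01..01) & ~(x | 0x7f..7f)
    // i.e. the classic "has zero byte/halfword" test, leaving the top bit set
    // in every matching lane. C sketch for the LL case (names illustrative):
    //   uint64_t x   = loaded ^ first_replicated;
    //   uint64_t hit = (x - 0x0101010101010101ULL)
    //                & ~(x | 0x7f7f7f7f7f7f7f7fULL);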
4311 if (str1_isL != str2_isL) {
4312 __ fmovd(ch1, v1);
4313 }
4314 __ br(__ NE, L_HAS_ZERO);
4315 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4316 __ add(result, result, wordSize/str2_chr_size);
4317 __ add(str2, str2, wordSize);
4318 __ br(__ LT, L_POST_LOOP);
4319 __ BIND(L_LOOP);
4320 __ ldr(ch2, Address(str2));
4321 __ eor(ch2, first, ch2);
4322 __ sub(tmp2, ch2, tmp1);
4323 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4324 __ bics(tmp2, tmp2, ch2);
4325 __ br(__ NE, L_HAS_ZERO);
4326 __ BIND(L_LOOP_PROCEED);
4327 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4328 __ add(str2, str2, wordSize);
4329 __ add(result, result, wordSize/str2_chr_size);
4330 __ br(__ GE, L_LOOP);
4331 __ BIND(L_POST_LOOP);
4332 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4333 __ br(__ LE, NOMATCH);
4334 __ ldr(ch2, Address(str2));
4335 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4336 __ eor(ch2, first, ch2);
4337 __ sub(tmp2, ch2, tmp1);
4338 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4339 __ mov(tmp4, -1); // all bits set
4340 __ b(L_SMALL_PROCEED);
4341 __ align(OptoLoopAlignment);
4342 __ BIND(L_SMALL);
4343 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4344 __ eor(ch2, first, ch2);
4345 if (str1_isL != str2_isL) {
4346 __ zip1(v1, __ T16B, v1, v0);
4347 }
4348 __ sub(tmp2, ch2, tmp1);
4349 __ mov(tmp4, -1); // all bits set
4350 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4351 if (str1_isL != str2_isL) {
4352 __ fmovd(ch1, v1); // move converted 4 symbols
4353 }
4354 __ BIND(L_SMALL_PROCEED);
4355 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4356 __ bic(tmp2, tmp2, ch2);
4357 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4358 __ rbit(tmp2, tmp2);
4359 __ br(__ EQ, NOMATCH);
4360 __ BIND(L_SMALL_HAS_ZERO_LOOP);
4361     __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4362 __ cmp(cnt1, u1(wordSize/str2_chr_size));
4363 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4364 if (str2_isL) { // LL
4365 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4366 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4367 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4368 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4369 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4370 } else {
4371 __ mov(ch2, 0xE); // all bits in byte set except last one
4372 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4373 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4374 __ lslv(tmp2, tmp2, tmp4);
4375 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4376 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4377 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4378 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4379 }
4380 __ cmp(ch1, ch2);
4381 __ mov(tmp4, wordSize/str2_chr_size);
4382 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4383 __ BIND(L_SMALL_CMP_LOOP);
4384 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4385 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4386 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4387 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4388 __ add(tmp4, tmp4, 1);
4389 __ cmp(tmp4, cnt1);
4390 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4391 __ cmp(first, ch2);
4392 __ br(__ EQ, L_SMALL_CMP_LOOP);
4393 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4394 __ cbz(tmp2, NOMATCH); // no more matches. exit
4395 __ clz(tmp4, tmp2);
4396 __ add(result, result, 1); // advance index
4397 __ add(str2, str2, str2_chr_size); // advance pointer
4398 __ b(L_SMALL_HAS_ZERO_LOOP);
4399 __ align(OptoLoopAlignment);
4400 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4401 __ cmp(first, ch2);
4402 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4403 __ b(DONE);
4404 __ align(OptoLoopAlignment);
4405 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4406 if (str2_isL) { // LL
4407 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4408 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4409 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4410 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4411 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4412 } else {
4413 __ mov(ch2, 0xE); // all bits in byte set except last one
4414 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4415 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4416 __ lslv(tmp2, tmp2, tmp4);
4417 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4418 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4419 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4420 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4421 }
4422 __ cmp(ch1, ch2);
4423 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4424 __ b(DONE);
4425 __ align(OptoLoopAlignment);
4426 __ BIND(L_HAS_ZERO);
4427 __ rbit(tmp2, tmp2);
4428     __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4429     // Now, compress the counters (cnt2 and cnt1) into one register. This is fine
4430     // because both counters are 32-bit and are not changed in this loop; they are
4431     // simply restored on exit, so cnt1 can be re-used in this loop.
4432 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
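    // i.e. cnt2 = (cnt1 << 32) | cnt2; cnt1 is recovered later with a 32-bit
    // right shift, and the low half with movw (a zero-extending 32-bit move).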
4433 __ sub(result, result, 1);
4434 __ BIND(L_HAS_ZERO_LOOP);
4435 __ mov(cnt1, wordSize/str2_chr_size);
4436 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4437 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4438 if (str2_isL) {
4439 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4440 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4441 __ lslv(tmp2, tmp2, tmp4);
4442 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4443 __ add(tmp4, tmp4, 1);
4444 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4445 __ lsl(tmp2, tmp2, 1);
4446 __ mov(tmp4, wordSize/str2_chr_size);
4447 } else {
4448 __ mov(ch2, 0xE);
4449 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4450 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4451 __ lslv(tmp2, tmp2, tmp4);
4452 __ add(tmp4, tmp4, 1);
4453 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4454 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4455 __ lsl(tmp2, tmp2, 1);
4456 __ mov(tmp4, wordSize/str2_chr_size);
4457 __ sub(str2, str2, str2_chr_size);
4458 }
4459 __ cmp(ch1, ch2);
4460 __ mov(tmp4, wordSize/str2_chr_size);
4461 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4462 __ BIND(L_CMP_LOOP);
4463 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4464 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4465 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4466 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4467 __ add(tmp4, tmp4, 1);
4468 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4469 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4470 __ cmp(cnt1, ch2);
4471 __ br(__ EQ, L_CMP_LOOP);
4472 __ BIND(L_CMP_LOOP_NOMATCH);
4473 // here we're not matched
4474 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4475 __ clz(tmp4, tmp2);
4476 __ add(str2, str2, str2_chr_size); // advance pointer
4477 __ b(L_HAS_ZERO_LOOP);
4478 __ align(OptoLoopAlignment);
4479 __ BIND(L_CMP_LOOP_LAST_CMP);
4480 __ cmp(cnt1, ch2);
4481 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4482 __ b(DONE);
4483 __ align(OptoLoopAlignment);
4484 __ BIND(L_CMP_LOOP_LAST_CMP2);
4485 if (str2_isL) {
4486 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4487 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4488 __ lslv(tmp2, tmp2, tmp4);
4489 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4490 __ add(tmp4, tmp4, 1);
4491 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4492 __ lsl(tmp2, tmp2, 1);
4493 } else {
4494 __ mov(ch2, 0xE);
4495 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4496 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4497 __ lslv(tmp2, tmp2, tmp4);
4498 __ add(tmp4, tmp4, 1);
4499 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4500 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4501 __ lsl(tmp2, tmp2, 1);
4502 __ sub(str2, str2, str2_chr_size);
4503 }
4504 __ cmp(ch1, ch2);
4505 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4506 __ b(DONE);
4507 __ align(OptoLoopAlignment);
4508 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4509     // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4510     //    until the L_HAS_ZERO block. The byte octet was analyzed in
4511     //    L_HAS_ZERO_LOOP, so result was increased by at most
4512     //    wordSize/str2_chr_size - 1 and the respective high bits were not
4513     //    changed. L_LOOP_PROCEED will increase result by the number of analyzed
4514     //    characters, so we can just reset the lower bits of result here
4515     //    (clear 2 lower bits for UU/UL and 3 bits for LL).
4516     // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4517     // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is
4518     //    the index of the last analyzed substring inside the current octet, so str2 is at the respective start address; advance it to the next octet.
4519 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4520 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4521 __ bfm(result, zr, 0, 2 - str2_chr_shift);
4522 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4523 __ movw(cnt2, cnt2);
4524 __ b(L_LOOP_PROCEED);
4525 __ align(OptoLoopAlignment);
4526 __ BIND(NOMATCH);
4527 __ mov(result, -1);
4528 __ BIND(DONE);
4529 __ pop(spilled_regs, sp);
4530 __ ret(lr);
4531 return entry;
4532 }
4533
4534   void generate_string_indexof_stubs() {
4535 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4536 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4537 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4538 }
4539
4540   void inflate_and_store_2_fp_registers(bool generatePrfm,
4541 FloatRegister src1, FloatRegister src2) {
4542 Register dst = r1;
4543 __ zip1(v1, __ T16B, src1, v0);
4544 __ zip2(v2, __ T16B, src1, v0);
4545 if (generatePrfm) {
4546 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4547 }
4548 __ zip1(v3, __ T16B, src2, v0);
4549 __ zip2(v4, __ T16B, src2, v0);
4550 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4551 }
4552
4553 // R0 = src
4554 // R1 = dst
4555 // R2 = len
4556 // R3 = len >> 3
4557 // V0 = 0
4558 // v1 = loaded 8 bytes
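  // "Inflation" widens Latin1 bytes into UTF-16 chars: zip1/zip2 against the
  // zero register v0 interleave a 0x00 high byte after each source byte.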
4559   address generate_large_byte_array_inflate() {
4560 __ align(CodeEntryAlignment);
4561 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4562 address entry = __ pc();
4563 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4564 Register src = r0, dst = r1, len = r2, octetCounter = r3;
4565 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4566
4567     // Do one more 8-byte read so that the address is 16-byte aligned in most
4568     // cases; this also lets us use a single store instruction
4569 __ ldrd(v2, __ post(src, 8));
4570 __ sub(octetCounter, octetCounter, 2);
4571 __ zip1(v1, __ T16B, v1, v0);
4572 __ zip1(v2, __ T16B, v2, v0);
4573 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4574 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4575 __ subs(rscratch1, octetCounter, large_loop_threshold);
4576 __ br(__ LE, LOOP_START);
4577 __ b(LOOP_PRFM_START);
4578 __ bind(LOOP_PRFM);
4579 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4580 __ bind(LOOP_PRFM_START);
4581 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4582 __ sub(octetCounter, octetCounter, 8);
4583 __ subs(rscratch1, octetCounter, large_loop_threshold);
4584 inflate_and_store_2_fp_registers(true, v3, v4);
4585 inflate_and_store_2_fp_registers(true, v5, v6);
4586 __ br(__ GT, LOOP_PRFM);
4587 __ cmp(octetCounter, (u1)8);
4588 __ br(__ LT, DONE);
4589 __ bind(LOOP);
4590 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4591 __ bind(LOOP_START);
4592 __ sub(octetCounter, octetCounter, 8);
4593 __ cmp(octetCounter, (u1)8);
4594 inflate_and_store_2_fp_registers(false, v3, v4);
4595 inflate_and_store_2_fp_registers(false, v5, v6);
4596 __ br(__ GE, LOOP);
4597 __ bind(DONE);
4598 __ ret(lr);
4599 return entry;
4600 }
4601
4602 /**
4603 * Arguments:
4604 *
4605 * Input:
4606 * c_rarg0 - current state address
4607 * c_rarg1 - H key address
4608 * c_rarg2 - data address
4609 * c_rarg3 - number of blocks
4610 *
4611 * Output:
4612 * Updated state at c_rarg0
4613 */
4614   address generate_ghash_processBlocks() {
4615 // Bafflingly, GCM uses little-endian for the byte order, but
4616 // big-endian for the bit order. For example, the polynomial 1 is
4617 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4618 //
4619 // So, we must either reverse the bytes in each word and do
4620 // everything big-endian or reverse the bits in each byte and do
4621 // it little-endian. On AArch64 it's more idiomatic to reverse
4622 // the bits in each byte (we have an instruction, RBIT, to do
4623     // that) and keep the data in little-endian bit order throughout the
4624 // calculation, bit-reversing the inputs and outputs.
4625
4626 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4627 __ align(wordSize * 2);
4628 address p = __ pc();
4629 __ emit_int64(0x87); // The low-order bits of the field
4630 // polynomial (i.e. p = z^7+z^2+z+1)
4631 // repeated in the low and high parts of a
4632 // 128-bit vector
4633 __ emit_int64(0x87);
4634
4635 __ align(CodeEntryAlignment);
4636 address start = __ pc();
4637
4638 Register state = c_rarg0;
4639 Register subkeyH = c_rarg1;
4640 Register data = c_rarg2;
4641 Register blocks = c_rarg3;
4642
4643 FloatRegister vzr = v30;
4644 __ eor(vzr, __ T16B, vzr, vzr); // zero register
4645
4646 __ ldrq(v0, Address(state));
4647 __ ldrq(v1, Address(subkeyH));
4648
4649 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
4650 __ rbit(v0, __ T16B, v0);
4651 __ rev64(v1, __ T16B, v1);
4652 __ rbit(v1, __ T16B, v1);
4653
4654 __ ldrq(v26, p);
4655
4656 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4657 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4658
4659 {
4660 Label L_ghash_loop;
4661 __ bind(L_ghash_loop);
4662
4663 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4664 // reversing each byte
4665 __ rbit(v2, __ T16B, v2);
4666 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
4667
4668 // Multiply state in v2 by subkey in v1
4669 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4670 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4671 /*temps*/v6, v20, v18, v21);
4672 // Reduce v7:v5 by the field polynomial
4673 ghash_reduce(v0, v5, v7, v26, vzr, v20);
4674
4675 __ sub(blocks, blocks, 1);
4676 __ cbnz(blocks, L_ghash_loop);
4677 }
4678
4679 // The bit-reversed result is at this point in v0
4680 __ rev64(v1, __ T16B, v0);
4681 __ rbit(v1, __ T16B, v1);
4682
4683 __ st1(v1, __ T16B, state);
4684 __ ret(lr);
4685
4686 return start;
4687 }
4688
4689   void generate_base64_encode_simdround(Register src, Register dst,
4690 FloatRegister codec, u8 size) {
4691
4692 FloatRegister in0 = v4, in1 = v5, in2 = v6;
4693 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
4694 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
4695
4696 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
4697
4698 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
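    // ld3 de-interleaves the 3-byte groups: byte k of each group lands in
    // in_k. The shifts below split every 24-bit group into four 6-bit
    // indices (ind0..ind3), and tbl translates each index through the
    // 64-byte codec table held in four consecutive SIMD registers.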
4699
4700 __ ushr(ind0, arrangement, in0, 2);
4701
4702 __ ushr(ind1, arrangement, in1, 2);
4703 __ shl(in0, arrangement, in0, 6);
4704 __ orr(ind1, arrangement, ind1, in0);
4705 __ ushr(ind1, arrangement, ind1, 2);
4706
4707 __ ushr(ind2, arrangement, in2, 4);
4708 __ shl(in1, arrangement, in1, 4);
4709 __ orr(ind2, arrangement, in1, ind2);
4710 __ ushr(ind2, arrangement, ind2, 2);
4711
4712 __ shl(ind3, arrangement, in2, 2);
4713 __ ushr(ind3, arrangement, ind3, 2);
4714
4715 __ tbl(out0, arrangement, codec, 4, ind0);
4716 __ tbl(out1, arrangement, codec, 4, ind1);
4717 __ tbl(out2, arrangement, codec, 4, ind2);
4718 __ tbl(out3, arrangement, codec, 4, ind3);
4719
4720 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
4721 }
4722
4723 /**
4724 * Arguments:
4725 *
4726 * Input:
4727 * c_rarg0 - src_start
4728 * c_rarg1 - src_offset
4729 * c_rarg2 - src_length
4730 * c_rarg3 - dest_start
4731 * c_rarg4 - dest_offset
4732 * c_rarg5 - isURL
4733 *
4734 */
4735   address generate_base64_encodeBlock() {
4736
4737 static const char toBase64[64] = {
4738 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4739 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4740 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4741 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4742 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
4743 };
4744
4745 static const char toBase64URL[64] = {
4746 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
4747 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
4748 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
4749 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
4750 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
4751 };
4752
4753 __ align(CodeEntryAlignment);
4754 StubCodeMark mark(this, "StubRoutines", "encodeBlock");
4755 address start = __ pc();
4756
4757 Register src = c_rarg0; // source array
4758 Register soff = c_rarg1; // source start offset
4759 Register send = c_rarg2; // source end offset
4760 Register dst = c_rarg3; // dest array
4761 Register doff = c_rarg4; // position for writing to dest array
4762 Register isURL = c_rarg5; // Base64 or URL character set
4763
4764 // c_rarg6 and c_rarg7 are free to use as temps
4765 Register codec = c_rarg6;
4766 Register length = c_rarg7;
4767
4768 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
4769
4770 __ add(src, src, soff);
4771 __ add(dst, dst, doff);
4772 __ sub(length, send, soff);
4773
4774 // load the codec base address
4775 __ lea(codec, ExternalAddress((address) toBase64));
4776 __ cbz(isURL, ProcessData);
4777 __ lea(codec, ExternalAddress((address) toBase64URL));
4778
4779 __ BIND(ProcessData);
4780
4781 // too short to set up a SIMD loop; fall back to the scalar path
4782 __ cmp(length, (u1)24);
4783 __ br(Assembler::LT, Process3B);
4784
4785 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
4786
4787 __ BIND(Process48B);
4788 __ cmp(length, (u1)48);
4789 __ br(Assembler::LT, Process24B);
4790 generate_base64_encode_simdround(src, dst, v0, 16);
4791 __ sub(length, length, 48);
4792 __ b(Process48B);
4793
4794 __ BIND(Process24B);
4795 __ cmp(length, (u1)24);
4796 __ br(Assembler::LT, SIMDExit);
4797 generate_base64_encode_simdround(src, dst, v0, 8);
4798 __ sub(length, length, 24);
4799
4800 __ BIND(SIMDExit);
4801 __ cbz(length, Exit);
4802
4803 __ BIND(Process3B);
4804 // 3 src bytes, 24 bits
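  // In C, this scalar tail is approximately:
  //   u32 v = (src[0] << 16) | (src[1] << 8) | src[2];
  //   *dst++ = codec[(v >> 18) & 63];  *dst++ = codec[(v >> 12) & 63];
  //   *dst++ = codec[(v >>  6) & 63];  *dst++ = codec[ v        & 63];
  //   src += 3; length -= 3;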
4805 __ ldrb(r10, __ post(src, 1));
4806 __ ldrb(r11, __ post(src, 1));
4807 __ ldrb(r12, __ post(src, 1));
4808 __ orrw(r11, r11, r10, Assembler::LSL, 8);
4809 __ orrw(r12, r12, r11, Assembler::LSL, 8);
4810 // codec index
4811 __ ubfmw(r15, r12, 18, 23);
4812 __ ubfmw(r14, r12, 12, 17);
4813 __ ubfmw(r13, r12, 6, 11);
4814 __ andw(r12, r12, 63);
4815 // get the code based on the codec
4816 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
4817 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
4818 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
4819 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
4820 __ strb(r15, __ post(dst, 1));
4821 __ strb(r14, __ post(dst, 1));
4822 __ strb(r13, __ post(dst, 1));
4823 __ strb(r12, __ post(dst, 1));
4824 __ sub(length, length, 3);
4825 __ cbnz(length, Process3B);
4826
4827 __ BIND(Exit);
4828 __ ret(lr);
4829
4830 return start;
4831 }
4832
4833 // Continuation point for throwing of implicit exceptions that are
4834 // not handled in the current activation. Fabricates an exception
4835 // oop and initiates normal exception dispatching in this
4836 // frame. Since we need to preserve callee-saved values (currently
4837 // only for C2, but done for C1 as well) we need a callee-saved oop
4838 // map and therefore have to make these stubs into RuntimeStubs
4839 // rather than BufferBlobs. If the compiler needs all registers to
4840 // be preserved between the fault point and the exception handler
4841 // then it must assume responsibility for that in
4842 // AbstractCompiler::continuation_for_implicit_null_exception or
4843 // continuation_for_implicit_division_by_zero_exception. All other
4844 // implicit exceptions (e.g., NullPointerException or
4845 // AbstractMethodError on entry) are either at call sites or
4846 // otherwise assume that stack unwinding will be initiated, so
4847 // caller saved registers were assumed volatile in the compiler.
4848
4849 #undef __
4850 #define __ masm->
4851
4852 address generate_throw_exception(const char* name,
4853 address runtime_entry,
4854 Register arg1 = noreg,
4855 Register arg2 = noreg) {
4856 // Information about frame layout at time of blocking runtime call.
4857 // Note that we only have to preserve callee-saved registers since
4858 // the compilers are responsible for supplying a continuation point
4859 // if they expect all registers to be preserved.
4860 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4861 enum layout {
4862 rfp_off = 0,
4863 rfp_off2,
4864 return_off,
4865 return_off2,
4866 framesize // inclusive of return address
4867 };
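  // A sketch of the frame this builds (each enum entry is a 32-bit
  // slot, so every 64-bit register takes an _off/_off2 pair; here
  // framesize-4 == 0, so sp ends up equal to rfp):
  //
  //   sp, rfp -> [ saved rfp ]  rfp_off/rfp_off2
  //              [ saved lr  ]  return_off/return_off2
  //              [ caller's frame ... ]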
4868
4869 int insts_size = 512;
4870 int locs_size = 64;
4871
4872 CodeBuffer code(name, insts_size, locs_size);
4873 OopMapSet* oop_maps = new OopMapSet();
4874 MacroAssembler* masm = new MacroAssembler(&code);
4875
4876 address start = __ pc();
4877
4878 // This is an inlined and slightly modified version of call_VM
4879 // which has the ability to fetch the return PC out of
4880 // thread-local storage and also sets up last_Java_sp slightly
4881 // differently than the real call_VM
4882
4883 __ enter(); // Save FP and LR before call
4884
4885 assert(is_even(framesize/2), "sp not 16-byte aligned");
4886
4887 // lr and fp are already in place
4888 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4889
4890 int frame_complete = __ pc() - start;
4891
4892 // Set up last_Java_sp and last_Java_fp
4893 address the_pc = __ pc();
4894 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4895
4896 // Call runtime
4897 if (arg1 != noreg) {
4898 assert(arg2 != c_rarg1, "clobbered");
4899 __ mov(c_rarg1, arg1);
4900 }
4901 if (arg2 != noreg) {
4902 __ mov(c_rarg2, arg2);
4903 }
4904 __ mov(c_rarg0, rthread);
4905 BLOCK_COMMENT("call runtime_entry");
4906 __ mov(rscratch1, runtime_entry);
4907 __ blr(rscratch1);
4908
4909 // Generate oop map
4910 OopMap* map = new OopMap(framesize, 0);
4911
4912 oop_maps->add_gc_map(the_pc - start, map);
4913
4914 __ reset_last_Java_frame(true);
4915 __ maybe_isb();
4916
4917 __ leave();
4918
4919 // check for pending exceptions
4920 #ifdef ASSERT
4921 Label L;
4922 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4923 __ cbnz(rscratch1, L);
4924 __ should_not_reach_here();
4925 __ bind(L);
4926 #endif // ASSERT
4927 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4928
4929
4930 // codeBlob framesize is in words (not VMRegImpl::slot_size)
4931 RuntimeStub* stub =
4932 RuntimeStub::new_runtime_stub(name,
4933 &code,
4934 frame_complete,
4935 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4936 oop_maps, false);
4937 return stub->entry_point();
4938 }
4939
4940 class MontgomeryMultiplyGenerator : public MacroAssembler {
4941
4942 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4943 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4944
4945 RegSet _toSave;
4946 bool _squaring;
4947
4948 public:
4949 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4950 : MacroAssembler(as->code()), _squaring(squaring) {
4951
4952 // Register allocation
4953
4954 Register reg = c_rarg0;
4955 Pa_base = reg; // Argument registers
4956 if (squaring)
4957 Pb_base = Pa_base;
4958 else
4959 Pb_base = ++reg;
4960 Pn_base = ++reg;
4961 Rlen= ++reg;
4962 inv = ++reg;
4963 Pm_base = ++reg;
4964
4965 // Working registers:
4966 Ra = ++reg; // The current digit of a, b, n, and m.
4967 Rb = ++reg;
4968 Rm = ++reg;
4969 Rn = ++reg;
4970
4971 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4972 Pb = ++reg;
4973 Pm = ++reg;
4974 Pn = ++reg;
4975
4976 t0 = ++reg; // Three registers which form a
4977 t1 = ++reg; // triple-precision accumulator.
4978 t2 = ++reg;
4979
4980 Ri = ++reg; // Inner and outer loop indexes.
4981 Rj = ++reg;
4982
4983 Rhi_ab = ++reg; // Product registers: low and high parts
4984 Rlo_ab = ++reg; // of a*b and m*n.
4985 Rhi_mn = ++reg;
4986 Rlo_mn = ++reg;
4987
4988 // r19 and up are callee-saved.
4989 _toSave = RegSet::range(r19, reg) + Pm_base;
4990 }
4991
4992 private:
4993 void save_regs() {
4994 push(_toSave, sp);
4995 }
4996
4997 void restore_regs() {
4998 pop(_toSave, sp);
4999 }
5000
5001 template <typename T>
5002 void unroll_2(Register count, T block) {
5003 Label loop, end, odd;
5004 tbnz(count, 0, odd);
5005 cbz(count, end);
5006 align(16);
5007 bind(loop);
5008 (this->*block)();
5009 bind(odd);
5010 (this->*block)();
5011 subs(count, count, 2);
5012 br(Assembler::GT, loop);
5013 bind(end);
5014 }
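  // Net effect: runs `block` exactly `count` times (for count >= 0),
  // emitting two copies per backward branch; odd counts enter at the
  // second copy. In C, approximately:
  //   for (; count > 0; count--) block();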
5015
5016 template <typename T>
5017 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
5018 Label loop, end, odd;
5019 tbnz(count, 0, odd);
5020 cbz(count, end);
5021 align(16);
5022 bind(loop);
5023 (this->*block)(d, s, tmp);
5024 bind(odd);
5025 (this->*block)(d, s, tmp);
5026 subs(count, count, 2);
5027 br(Assembler::GT, loop);
5028 bind(end);
5029 }
5030
5031 void pre1(RegisterOrConstant i) {
5032 block_comment("pre1");
5033 // Pa = Pa_base;
5034 // Pb = Pb_base + i;
5035 // Pm = Pm_base;
5036 // Pn = Pn_base + i;
5037 // Ra = *Pa;
5038 // Rb = *Pb;
5039 // Rm = *Pm;
5040 // Rn = *Pn;
5041 ldr(Ra, Address(Pa_base));
5042 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5043 ldr(Rm, Address(Pm_base));
5044 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5045 lea(Pa, Address(Pa_base));
5046 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5047 lea(Pm, Address(Pm_base));
5048 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5049
5050 // Zero the m*n result.
5051 mov(Rhi_mn, zr);
5052 mov(Rlo_mn, zr);
5053 }
5054
5055 // The core multiply-accumulate step of a Montgomery
5056 // multiplication. The idea is to schedule operations as a
5057 // pipeline so that instructions with long latencies (loads and
5058 // multiplies) have time to complete before their results are
5059 // used. This most benefits in-order implementations of the
5060 // architecture but out-of-order ones also benefit.
5061 void step() {
5062 block_comment("step");
5063 // MACC(Ra, Rb, t0, t1, t2);
5064 // Ra = *++Pa;
5065 // Rb = *--Pb;
5066 umulh(Rhi_ab, Ra, Rb);
5067 mul(Rlo_ab, Ra, Rb);
5068 ldr(Ra, pre(Pa, wordSize));
5069 ldr(Rb, pre(Pb, -wordSize));
5070 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5071 // previous iteration.
5072 // MACC(Rm, Rn, t0, t1, t2);
5073 // Rm = *++Pm;
5074 // Rn = *--Pn;
5075 umulh(Rhi_mn, Rm, Rn);
5076 mul(Rlo_mn, Rm, Rn);
5077 ldr(Rm, pre(Pm, wordSize));
5078 ldr(Rn, pre(Pn, -wordSize));
5079 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5080 }
5081
5082 void post1() {
5083 block_comment("post1");
5084
5085 // MACC(Ra, Rb, t0, t1, t2);
5086 // Ra = *++Pa;
5087 // Rb = *--Pb;
5088 umulh(Rhi_ab, Ra, Rb);
5089 mul(Rlo_ab, Ra, Rb);
5090 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5091 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5092
5093 // *Pm = Rm = t0 * inv;
5094 mul(Rm, t0, inv);
5095 str(Rm, Address(Pm));
5096
5097 // MACC(Rm, Rn, t0, t1, t2);
5098 // t0 = t1; t1 = t2; t2 = 0;
5099 umulh(Rhi_mn, Rm, Rn);
5100
5101 #ifndef PRODUCT
5102 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5103 {
5104 mul(Rlo_mn, Rm, Rn);
5105 add(Rlo_mn, t0, Rlo_mn);
5106 Label ok;
5107 cbz(Rlo_mn, ok); {
5108 stop("broken Montgomery multiply");
5109 } bind(ok);
5110 }
5111 #endif
5112 // We have very carefully set things up so that
5113 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5114 // the lower half of Rm * Rn because we know the result already:
5115 // it must be -t0. t0 + (-t0) must generate a carry iff
5116 // t0 != 0. So, rather than do a mul and an adds we just set
5117 // the carry flag iff t0 is nonzero.
5118 //
5119 // mul(Rlo_mn, Rm, Rn);
5120 // adds(zr, t0, Rlo_mn);
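    // (Worked example with 64-bit words: for t0 == 1 the elided adds
    // would compute 1 + 0xFFFFFFFFFFFFFFFF, wrapping to zero with a
    // carry out; for t0 == 0 it is 0 + 0 with no carry. subs(zr, t0, 1)
    // borrows -- clearing the carry -- exactly when t0 == 0, so the
    // flags come out the same.)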
5121 subs(zr, t0, 1); // Set carry iff t0 is nonzero
5122 adcs(t0, t1, Rhi_mn);
5123 adc(t1, t2, zr);
5124 mov(t2, zr);
5125 }
5126
5127 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5128 block_comment("pre2");
5129 // Pa = Pa_base + i-len;
5130 // Pb = Pb_base + len;
5131 // Pm = Pm_base + i-len;
5132 // Pn = Pn_base + len;
5133
5134 if (i.is_register()) {
5135 sub(Rj, i.as_register(), len);
5136 } else {
5137 mov(Rj, i.as_constant());
5138 sub(Rj, Rj, len);
5139 }
5140 // Rj == i-len
5141
5142 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5143 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5144 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5145 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5146
5147 // Ra = *++Pa;
5148 // Rb = *--Pb;
5149 // Rm = *++Pm;
5150 // Rn = *--Pn;
5151 ldr(Ra, pre(Pa, wordSize));
5152 ldr(Rb, pre(Pb, -wordSize));
5153 ldr(Rm, pre(Pm, wordSize));
5154 ldr(Rn, pre(Pn, -wordSize));
5155
5156 mov(Rhi_mn, zr);
5157 mov(Rlo_mn, zr);
5158 }
5159
5160 void post2(RegisterOrConstant i, RegisterOrConstant len) {
5161 block_comment("post2");
5162 if (i.is_constant()) {
5163 mov(Rj, i.as_constant()-len.as_constant());
5164 } else {
5165 sub(Rj, i.as_register(), len);
5166 }
5167
5168 adds(t0, t0, Rlo_mn); // The pending m*n, low part
5169
5170 // As soon as we know the least significant digit of our result,
5171 // store it.
5172 // Pm_base[i-len] = t0;
5173 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5174
5175 // t0 = t1; t1 = t2; t2 = 0;
5176 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5177 adc(t1, t2, zr);
5178 mov(t2, zr);
5179 }
5180
5181 // A carry in t0 after Montgomery multiplication means that we
5182 // should subtract multiples of n from our result in m. We'll
5183 // keep doing that until there is no carry.
5184 void normalize(RegisterOrConstant len) {
5185 block_comment("normalize");
5186 // while (t0)
5187 // t0 = sub(Pm_base, Pn_base, t0, len);
5188 Label loop, post, again;
5189 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5190 cbz(t0, post); {
5191 bind(again); {
5192 mov(i, zr);
5193 mov(cnt, len);
5194 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5195 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5196 subs(zr, zr, zr); // set carry flag, i.e. no borrow
5197 align(16);
5198 bind(loop); {
5199 sbcs(Rm, Rm, Rn);
5200 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5201 add(i, i, 1);
5202 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5203 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5204 sub(cnt, cnt, 1);
5205 } cbnz(cnt, loop);
5206 sbc(t0, t0, zr);
5207 } cbnz(t0, again);
5208 } bind(post);
5209 }
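  // The sub() in the comment above is, in C, approximately (a sketch
  // of the semantics only; nothing is generated from this as such):
  //
  //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
  //                     unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long s = Pn[i] + borrow;  // may wrap when borrow==1
  //       unsigned long d = Pm[i] - s;
  //       borrow = (s < borrow) | (Pm[i] < s);
  //       Pm[i] = d;
  //     }
  //     return t0 - borrow;
  //   }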
5210
5211 // Move memory at s to d, reversing words.
5212 // Increments d to end of copied memory
5213 // Destroys tmp1, tmp2
5214 // Preserves len
5215 // Leaves s pointing to the address which was in d at start
5216 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5217 assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5218
5219 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5220 mov(tmp1, len);
5221 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5222 sub(s, d, len, ext::uxtw, LogBytesPerWord);
5223 }
5224 // where
5225 void reverse1(Register d, Register s, Register tmp) {
5226 ldr(tmp, pre(s, -wordSize));
5227 ror(tmp, tmp, 32);
5228 str(tmp, post(d, wordSize));
5229 }
5230
5231 void step_squaring() {
5232 // An extra ACC
5233 step();
5234 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5235 }
5236
5237 void last_squaring(RegisterOrConstant i) {
5238 Label dont;
5239 // if ((i & 1) == 0) {
5240 tbnz(i.as_register(), 0, dont); {
5241 // MACC(Ra, Rb, t0, t1, t2);
5242 // Ra = *++Pa;
5243 // Rb = *--Pb;
5244 umulh(Rhi_ab, Ra, Rb);
5245 mul(Rlo_ab, Ra, Rb);
5246 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5247 } bind(dont);
5248 }
5249
5250 void extra_step_squaring() {
5251 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5252
5253 // MACC(Rm, Rn, t0, t1, t2);
5254 // Rm = *++Pm;
5255 // Rn = *--Pn;
5256 umulh(Rhi_mn, Rm, Rn);
5257 mul(Rlo_mn, Rm, Rn);
5258 ldr(Rm, pre(Pm, wordSize));
5259 ldr(Rn, pre(Pn, -wordSize));
5260 }
5261
5262 void post1_squaring() {
5263 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
5264
5265 // *Pm = Rm = t0 * inv;
5266 mul(Rm, t0, inv);
5267 str(Rm, Address(Pm));
5268
5269 // MACC(Rm, Rn, t0, t1, t2);
5270 // t0 = t1; t1 = t2; t2 = 0;
5271 umulh(Rhi_mn, Rm, Rn);
5272
5273 #ifndef PRODUCT
5274 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5275 {
5276 mul(Rlo_mn, Rm, Rn);
5277 add(Rlo_mn, t0, Rlo_mn);
5278 Label ok;
5279 cbz(Rlo_mn, ok); {
5280 stop("broken Montgomery multiply");
5281 } bind(ok);
5282 }
5283 #endif
5284 // We have very carefully set things up so that
5285 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5286 // the lower half of Rm * Rn because we know the result already:
5287 // it must be -t0. t0 + (-t0) must generate a carry iff
5288 // t0 != 0. So, rather than do a mul and an adds we just set
5289 // the carry flag iff t0 is nonzero.
5290 //
5291 // mul(Rlo_mn, Rm, Rn);
5292 // adds(zr, t0, Rlo_mn);
5293 subs(zr, t0, 1); // Set carry iff t0 is nonzero
5294 adcs(t0, t1, Rhi_mn);
5295 adc(t1, t2, zr);
5296 mov(t2, zr);
5297 }
5298
5299 void acc(Register Rhi, Register Rlo,
5300 Register t0, Register t1, Register t2) {
5301 adds(t0, t0, Rlo);
5302 adcs(t1, t1, Rhi);
5303 adc(t2, t2, zr);
5304 }
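  // The MACC() used throughout the C sketches below is acc() applied
  // to a fresh 64x64->128-bit product; in C, approximately (assuming
  // a compiler with unsigned __int128):
  //
  //   #define MACC(A, B, t0, t1, t2) do {                   \
  //     unsigned __int128 p = (unsigned __int128)(A) * (B); \
  //     p += (t0);             (t0) = (unsigned long)p;     \
  //     p = (p >> 64) + (t1);  (t1) = (unsigned long)p;     \
  //     (t2) += (unsigned long)(p >> 64);                   \
  //   } while (0)
  //
  // MACC2() (used in the squaring sketches) is the same with the
  // product doubled before it is accumulated.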
5305
5306 public:
5307 /**
5308 * Fast Montgomery multiplication. The derivation of the
5309 * algorithm is in A Cryptographic Library for the Motorola
5310 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5311 *
5312 * Arguments:
5313 *
5314 * Inputs for multiplication:
5315 * c_rarg0 - int array elements a
5316 * c_rarg1 - int array elements b
5317 * c_rarg2 - int array elements n (the modulus)
5318 * c_rarg3 - int length
5319 * c_rarg4 - int inv
5320 * c_rarg5 - int array elements m (the result)
5321 *
5322 * Inputs for squaring:
5323 * c_rarg0 - int array elements a
5324 * c_rarg1 - int array elements n (the modulus)
5325 * c_rarg2 - int length
5326 * c_rarg3 - int inv
5327 * c_rarg4 - int array elements m (the result)
5328 *
5329 */
5330 address generate_multiply() {
5331 Label argh, nothing;
5332 bind(argh);
5333 stop("MontgomeryMultiply total_allocation must be <= 8192");
5334
5335 align(CodeEntryAlignment);
5336 address entry = pc();
5337
5338 cbzw(Rlen, nothing);
5339
5340 enter();
5341
5342 // Make room.
5343 cmpw(Rlen, 512);
5344 br(Assembler::HI, argh);
5345 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5346 andr(sp, Ra, -2 * wordSize);
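    // In C, approximately:
    //   sp = align_down(sp - Rlen * 4 * sizeof(jint), 2 * wordSize);
    // i.e. 16 bytes per int of Rlen -- room for the reversed working
    // copies of the arguments plus the result, at most 8192 bytes
    // given the length check above.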
5347
5348 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
5349
5350 {
5351 // Copy input args, reversing as we go. We use Ra as a
5352 // temporary variable.
5353 reverse(Ra, Pa_base, Rlen, t0, t1);
5354 if (!_squaring)
5355 reverse(Ra, Pb_base, Rlen, t0, t1);
5356 reverse(Ra, Pn_base, Rlen, t0, t1);
5357 }
5358
5359 // Push all call-saved registers and also Pm_base which we'll need
5360 // at the end.
5361 save_regs();
5362
5363 #ifndef PRODUCT
5364 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5365 {
5366 ldr(Rn, Address(Pn_base, 0));
5367 mul(Rlo_mn, Rn, inv);
5368 subs(zr, Rlo_mn, -1);
5369 Label ok;
5370 br(EQ, ok); {
5371 stop("broken inverse in Montgomery multiply");
5372 } bind(ok);
5373 }
5374 #endif
5375
5376 mov(Pm_base, Ra);
5377
5378 mov(t0, zr);
5379 mov(t1, zr);
5380 mov(t2, zr);
5381
5382 block_comment("for (int i = 0; i < len; i++) {");
5383 mov(Ri, zr); {
5384 Label loop, end;
5385 cmpw(Ri, Rlen);
5386 br(Assembler::GE, end);
5387
5388 bind(loop);
5389 pre1(Ri);
5390
5391 block_comment(" for (j = i; j; j--) {"); {
5392 movw(Rj, Ri);
5393 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5394 } block_comment(" } // j");
5395
5396 post1();
5397 addw(Ri, Ri, 1);
5398 cmpw(Ri, Rlen);
5399 br(Assembler::LT, loop);
5400 bind(end);
5401 block_comment("} // i");
5402 }
5403
5404 block_comment("for (int i = len; i < 2*len; i++) {");
5405 mov(Ri, Rlen); {
5406 Label loop, end;
5407 cmpw(Ri, Rlen, Assembler::LSL, 1);
5408 br(Assembler::GE, end);
5409
5410 bind(loop);
5411 pre2(Ri, Rlen);
5412
5413 block_comment(" for (j = len*2-i-1; j; j--) {"); {
5414 lslw(Rj, Rlen, 1);
5415 subw(Rj, Rj, Ri);
5416 subw(Rj, Rj, 1);
5417 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5418 } block_comment(" } // j");
5419
5420 post2(Ri, Rlen);
5421 addw(Ri, Ri, 1);
5422 cmpw(Ri, Rlen, Assembler::LSL, 1);
5423 br(Assembler::LT, loop);
5424 bind(end);
5425 }
5426 block_comment("} // i");
5427
5428 normalize(Rlen);
5429
5430 mov(Ra, Pm_base); // Save Pm_base in Ra
5431 restore_regs(); // Restore caller's Pm_base
5432
5433 // Copy our result into caller's Pm_base
5434 reverse(Pm_base, Ra, Rlen, t0, t1);
5435
5436 leave();
5437 bind(nothing);
5438 ret(lr);
5439
5440 return entry;
5441 }
5442 // In C, approximately:
5443
5444 // void
5445 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5446 // unsigned long Pn_base[], unsigned long Pm_base[],
5447 // unsigned long inv, int len) {
5448 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5449 // unsigned long *Pa, *Pb, *Pn, *Pm;
5450 // unsigned long Ra, Rb, Rn, Rm;
5451
5452 // int i;
5453
5454 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5455
5456 // for (i = 0; i < len; i++) {
5457 // int j;
5458
5459 // Pa = Pa_base;
5460 // Pb = Pb_base + i;
5461 // Pm = Pm_base;
5462 // Pn = Pn_base + i;
5463
5464 // Ra = *Pa;
5465 // Rb = *Pb;
5466 // Rm = *Pm;
5467 // Rn = *Pn;
5468
5469 // int iters = i;
5470 // for (j = 0; iters--; j++) {
5471 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5472 // MACC(Ra, Rb, t0, t1, t2);
5473 // Ra = *++Pa;
5474 // Rb = *--Pb;
5475 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5476 // MACC(Rm, Rn, t0, t1, t2);
5477 // Rm = *++Pm;
5478 // Rn = *--Pn;
5479 // }
5480
5481 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5482 // MACC(Ra, Rb, t0, t1, t2);
5483 // *Pm = Rm = t0 * inv;
5484 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5485 // MACC(Rm, Rn, t0, t1, t2);
5486
5487 // assert(t0 == 0, "broken Montgomery multiply");
5488
5489 // t0 = t1; t1 = t2; t2 = 0;
5490 // }
5491
5492 // for (i = len; i < 2*len; i++) {
5493 // int j;
5494
5495 // Pa = Pa_base + i-len;
5496 // Pb = Pb_base + len;
5497 // Pm = Pm_base + i-len;
5498 // Pn = Pn_base + len;
5499
5500 // Ra = *++Pa;
5501 // Rb = *--Pb;
5502 // Rm = *++Pm;
5503 // Rn = *--Pn;
5504
5505 // int iters = len*2-i-1;
5506 // for (j = i-len+1; iters--; j++) {
5507 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5508 // MACC(Ra, Rb, t0, t1, t2);
5509 // Ra = *++Pa;
5510 // Rb = *--Pb;
5511 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5512 // MACC(Rm, Rn, t0, t1, t2);
5513 // Rm = *++Pm;
5514 // Rn = *--Pn;
5515 // }
5516
5517 // Pm_base[i-len] = t0;
5518 // t0 = t1; t1 = t2; t2 = 0;
5519 // }
5520
5521 // while (t0)
5522 // t0 = sub(Pm_base, Pn_base, t0, len);
5523 // }
5524
5525 /**
5526 * Fast Montgomery squaring. This uses asymptotically 25% fewer
5527 * multiplies than Montgomery multiplication so it should be up to
5528 * 25% faster. However, its loop control is more complex and it
5529 * may actually run slower on some machines.
5530 *
5531 * Arguments:
5532 *
5533 * Inputs:
5534 * c_rarg0 - int array elements a
5535 * c_rarg1 - int array elements n (the modulus)
5536 * c_rarg2 - int length
5537 * c_rarg3 - int inv
5538 * c_rarg4 - int array elements m (the result)
5539 *
5540 */
5541 address generate_square() {
5542 Label argh;
5543 bind(argh);
5544 stop("MontgomeryMultiply total_allocation must be <= 8192");
5545
5546 align(CodeEntryAlignment);
5547 address entry = pc();
5548
5549 enter();
5550
5551 // Make room.
5552 cmpw(Rlen, 512);
5553 br(Assembler::HI, argh);
5554 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5555 andr(sp, Ra, -2 * wordSize);
5556
5557 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
5558
5559 {
5560 // Copy input args, reversing as we go. We use Ra as a
5561 // temporary variable.
5562 reverse(Ra, Pa_base, Rlen, t0, t1);
5563 reverse(Ra, Pn_base, Rlen, t0, t1);
5564 }
5565
5566 // Push all call-saved registers and also Pm_base which we'll need
5567 // at the end.
5568 save_regs();
5569
5570 mov(Pm_base, Ra);
5571
5572 mov(t0, zr);
5573 mov(t1, zr);
5574 mov(t2, zr);
5575
5576 block_comment("for (int i = 0; i < len; i++) {");
5577 mov(Ri, zr); {
5578 Label loop, end;
5579 bind(loop);
5580 cmp(Ri, Rlen);
5581 br(Assembler::GE, end);
5582
5583 pre1(Ri);
5584
5585 block_comment("for (j = (i+1)/2; j; j--) {"); {
5586 add(Rj, Ri, 1);
5587 lsr(Rj, Rj, 1);
5588 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5589 } block_comment(" } // j");
5590
5591 last_squaring(Ri);
5592
5593 block_comment(" for (j = i/2; j; j--) {"); {
5594 lsr(Rj, Ri, 1);
5595 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5596 } block_comment(" } // j");
5597
5598 post1_squaring();
5599 add(Ri, Ri, 1);
5600 cmp(Ri, Rlen);
5601 br(Assembler::LT, loop);
5602
5603 bind(end);
5604 block_comment("} // i");
5605 }
5606
5607 block_comment("for (int i = len; i < 2*len; i++) {");
5608 mov(Ri, Rlen); {
5609 Label loop, end;
5610 bind(loop);
5611 cmp(Ri, Rlen, Assembler::LSL, 1);
5612 br(Assembler::GE, end);
5613
5614 pre2(Ri, Rlen);
5615
5616 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
5617 lsl(Rj, Rlen, 1);
5618 sub(Rj, Rj, Ri);
5619 sub(Rj, Rj, 1);
5620 lsr(Rj, Rj, 1);
5621 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5622 } block_comment(" } // j");
5623
5624 last_squaring(Ri);
5625
5626 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
5627 lsl(Rj, Rlen, 1);
5628 sub(Rj, Rj, Ri);
5629 lsr(Rj, Rj, 1);
5630 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5631 } block_comment(" } // j");
5632
5633 post2(Ri, Rlen);
5634 add(Ri, Ri, 1);
5635 cmp(Ri, Rlen, Assembler::LSL, 1);
5636
5637 br(Assembler::LT, loop);
5638 bind(end);
5639 block_comment("} // i");
5640 }
5641
5642 normalize(Rlen);
5643
5644 mov(Ra, Pm_base); // Save Pm_base in Ra
5645 restore_regs(); // Restore caller's Pm_base
5646
5647 // Copy our result into caller's Pm_base
5648 reverse(Pm_base, Ra, Rlen, t0, t1);
5649
5650 leave();
5651 ret(lr);
5652
5653 return entry;
5654 }
5655 // In C, approximately:
5656
5657 // void
5658 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5659 // unsigned long Pm_base[], unsigned long inv, int len) {
5660 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5661 // unsigned long *Pa, *Pb, *Pn, *Pm;
5662 // unsigned long Ra, Rb, Rn, Rm;
5663
5664 // int i;
5665
5666 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5667
5668 // for (i = 0; i < len; i++) {
5669 // int j;
5670
5671 // Pa = Pa_base;
5672 // Pb = Pa_base + i;
5673 // Pm = Pm_base;
5674 // Pn = Pn_base + i;
5675
5676 // Ra = *Pa;
5677 // Rb = *Pb;
5678 // Rm = *Pm;
5679 // Rn = *Pn;
5680
5681 // int iters = (i+1)/2;
5682 // for (j = 0; iters--; j++) {
5683 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5684 // MACC2(Ra, Rb, t0, t1, t2);
5685 // Ra = *++Pa;
5686 // Rb = *--Pb;
5687 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5688 // MACC(Rm, Rn, t0, t1, t2);
5689 // Rm = *++Pm;
5690 // Rn = *--Pn;
5691 // }
5692 // if ((i & 1) == 0) {
5693 // assert(Ra == Pa_base[j], "must be");
5694 // MACC(Ra, Ra, t0, t1, t2);
5695 // }
5696 // iters = i/2;
5697 // assert(iters == i-j, "must be");
5698 // for (; iters--; j++) {
5699 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5700 // MACC(Rm, Rn, t0, t1, t2);
5701 // Rm = *++Pm;
5702 // Rn = *--Pn;
5703 // }
5704
5705 // *Pm = Rm = t0 * inv;
5706 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5707 // MACC(Rm, Rn, t0, t1, t2);
5708
5709 // assert(t0 == 0, "broken Montgomery multiply");
5710
5711 // t0 = t1; t1 = t2; t2 = 0;
5712 // }
5713
5714 // for (i = len; i < 2*len; i++) {
5715 // int start = i-len+1;
5716 // int end = start + (len - start)/2;
5717 // int j;
5718
5719 // Pa = Pa_base + i-len;
5720 // Pb = Pa_base + len;
5721 // Pm = Pm_base + i-len;
5722 // Pn = Pn_base + len;
5723
5724 // Ra = *++Pa;
5725 // Rb = *--Pb;
5726 // Rm = *++Pm;
5727 // Rn = *--Pn;
5728
5729 // int iters = (2*len-i-1)/2;
5730 // assert(iters == end-start, "must be");
5731 // for (j = start; iters--; j++) {
5732 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5733 // MACC2(Ra, Rb, t0, t1, t2);
5734 // Ra = *++Pa;
5735 // Rb = *--Pb;
5736 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5737 // MACC(Rm, Rn, t0, t1, t2);
5738 // Rm = *++Pm;
5739 // Rn = *--Pn;
5740 // }
5741 // if ((i & 1) == 0) {
5742 // assert(Ra == Pa_base[j], "must be");
5743 // MACC(Ra, Ra, t0, t1, t2);
5744 // }
5745 // iters = (2*len-i)/2;
5746 // assert(iters == len-j, "must be");
5747 // for (; iters--; j++) {
5748 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5749 // MACC(Rm, Rn, t0, t1, t2);
5750 // Rm = *++Pm;
5751 // Rn = *--Pn;
5752 // }
5753 // Pm_base[i-len] = t0;
5754 // t0 = t1; t1 = t2; t2 = 0;
5755 // }
5756
5757 // while (t0)
5758 // t0 = sub(Pm_base, Pn_base, t0, len);
5759 // }
5760 };
5761
5762
5763 // Initialization
5764 void generate_initial() {
5765 // Generates the initial stubs and initializes the entry points
5766
5767 // entry points that exist on all platforms. Note: This is code
5768 // that could be shared among different platforms - however the
5769 // benefit seems to be smaller than the disadvantage of having a
5770 // much more complicated generator structure. See also comment in
5771 // stubRoutines.hpp.
5772
5773 StubRoutines::_forward_exception_entry = generate_forward_exception();
5774
5775 StubRoutines::_call_stub_entry =
5776 generate_call_stub(StubRoutines::_call_stub_return_address);
5777
5778 // is referenced by megamorphic call
5779 StubRoutines::_catch_exception_entry = generate_catch_exception();
5780
5781 // Build this early so it's available for the interpreter.
5782 StubRoutines::_throw_StackOverflowError_entry =
5783 generate_throw_exception("StackOverflowError throw_exception",
5784 CAST_FROM_FN_PTR(address,
5785 SharedRuntime::throw_StackOverflowError));
5786 StubRoutines::_throw_delayed_StackOverflowError_entry =
5787 generate_throw_exception("delayed StackOverflowError throw_exception",
5788 CAST_FROM_FN_PTR(address,
5789 SharedRuntime::throw_delayed_StackOverflowError));
5790 if (UseCRC32Intrinsics) {
5791 // set the table address before generating the stubs that use it
5792 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5793 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5794 }
5795
5796 if (UseCRC32CIntrinsics) {
5797 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5798 }
5799
5800 // Disabled until JDK-8210858 is fixed
5801 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5802 // StubRoutines::_dlog = generate_dlog();
5803 // }
5804
5805 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5806 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5807 }
5808
5809 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5810 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5811 }
5812 }
5813
5814 void generate_all() {
5815 // support for verify_oop (must happen after universe_init)
5816 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5817 StubRoutines::_throw_AbstractMethodError_entry =
5818 generate_throw_exception("AbstractMethodError throw_exception",
5819 CAST_FROM_FN_PTR(address,
5820 SharedRuntime::
5821 throw_AbstractMethodError));
5822
5823 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5824 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5825 CAST_FROM_FN_PTR(address,
5826 SharedRuntime::
5827 throw_IncompatibleClassChangeError));
5828
5829 StubRoutines::_throw_NullPointerException_at_call_entry =
5830 generate_throw_exception("NullPointerException at call throw_exception",
5831 CAST_FROM_FN_PTR(address,
5832 SharedRuntime::
5833 throw_NullPointerException_at_call));
5834
5835 // arraycopy stubs used by compilers
5836 generate_arraycopy_stubs();
5837
5838 // has negatives stub for large arrays.
5839 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5840
5841 // array equals stub for large arrays.
5842 if (!UseSimpleArrayEquals) {
5843 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5844 }
5845
5846 generate_compare_long_strings();
5847
5848 generate_string_indexof_stubs();
5849
5850 // byte_array_inflate stub for large arrays.
5851 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5852
5853 #ifdef COMPILER2
5854 if (UseMultiplyToLenIntrinsic) {
5855 StubRoutines::_multiplyToLen = generate_multiplyToLen();
5856 }
5857
5858 if (UseSquareToLenIntrinsic) {
5859 StubRoutines::_squareToLen = generate_squareToLen();
5860 }
5861
5862 if (UseMulAddIntrinsic) {
5863 StubRoutines::_mulAdd = generate_mulAdd();
5864 }
5865
5866 if (UseMontgomeryMultiplyIntrinsic) {
5867 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5868 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5869 StubRoutines::_montgomeryMultiply = g.generate_multiply();
5870 }
5871
5872 if (UseMontgomerySquareIntrinsic) {
5873 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5874 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5875 // We use generate_multiply() rather than generate_square()
5876 // because it's faster for the sizes of modulus we care about.
5877 StubRoutines::_montgomerySquare = g.generate_multiply();
5878 }
5879 #endif // COMPILER2
5880
5881 // generate GHASH intrinsics code
5882 if (UseGHASHIntrinsics) {
5883 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5884 }
5885
5886 if (UseBASE64Intrinsics) {
5887 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5888 }
5889
5890 if (UseAESIntrinsics) {
5891 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5892 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5893 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5894 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5895 }
5896
5897 if (UseSHA1Intrinsics) {
5898 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5899 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5900 }
5901 if (UseSHA256Intrinsics) {
5902 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5903 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5904 }
5905
5906 // generate Adler32 intrinsics code
5907 if (UseAdler32Intrinsics) {
5908 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5909 }
5910
5911 // Safefetch stubs.
5912 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
5913 &StubRoutines::_safefetch32_fault_pc,
5914 &StubRoutines::_safefetch32_continuation_pc);
5915 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5916 &StubRoutines::_safefetchN_fault_pc,
5917 &StubRoutines::_safefetchN_continuation_pc);
5918 StubRoutines::aarch64::set_completed();
5919 }
5920
5921 public:
5922 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5923 if (all) {
5924 generate_all();
5925 } else {
5926 generate_initial();
5927 }
5928 }
5929 }; // end class declaration
5930
5931 #define UCM_TABLE_MAX_ENTRIES 8
5932 void StubGenerator_generate(CodeBuffer* code, bool all) {
5933 if (UnsafeCopyMemory::_table == NULL) {
5934 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
5935 }
5936 StubGenerator g(code, all);
5937 }
5938