1#
2# Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
3# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4#
5# This code is free software; you can redistribute it and/or modify it
6# under the terms of the GNU General Public License version 2 only, as
7# published by the Free Software Foundation.
8#
9# This code is distributed in the hope that it will be useful, but WITHOUT
10# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12# version 2 for more details (a copy is included in the LICENSE file that
13# accompanied this code).
14#
15# You should have received a copy of the GNU General Public License version
16# 2 along with this work; if not, write to the Free Software Foundation,
17# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18#
19# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20# or visit www.oracle.com if you need additional information or have any
21# questions.
22#
23
24
25        # NOTE WELL!  The _Copy functions are called directly
26	# from server-compiler-generated code via CallLeafNoFP,
27	# which means that they *must* either not use floating
28	# point or use it in the same manner as does the server
29	# compiler.
30
31        .globl _Copy_arrayof_conjoint_bytes
32	.globl _Copy_arrayof_conjoint_jshorts
33        .globl _Copy_conjoint_jshorts_atomic
34        .globl _Copy_arrayof_conjoint_jints
35        .globl _Copy_conjoint_jints_atomic
36        .globl _Copy_arrayof_conjoint_jlongs
37        .globl _Copy_conjoint_jlongs_atomic
38
39	.text
40
41        .globl SpinPause
42        .align 16
43        .type  SpinPause,@function
        # SpinPause: issues one spin-wait hint and returns 1 in %rax.
        # Reads no argument registers and touches no memory.
44SpinPause:
        # The "rep" prefix followed by "nop" has the same encoding as the
        # PAUSE instruction.
45        rep
46        nop
47        movq   $1, %rax
48        ret
49
50        # Support for void Copy::arrayof_conjoint_bytes(void* from,
51        #                                               void* to,
52        #                                               size_t count)
53        # rdi - from
54        # rsi - to
55        # rdx - count, treated as ssize_t
56        #
57        .p2align 4,,15
58	.type    _Copy_arrayof_conjoint_bytes,@function
59_Copy_arrayof_conjoint_bytes:
        # The data is moved as (count >> 3) qwords followed by an optional
        # dword, word and byte tail selected by bits 2, 1 and 0 of the byte
        # count.  The copy direction is picked from the two pointers using
        # unsigned comparisons:
        #   to <= from                      -> acb_CopyRight (ascending)
        #   from < to <= from + count - 1   -> acb_CopyLeft  (descending)
        #   to > from + count - 1           -> acb_CopyRight (ascending)
        # %r8 keeps the byte count; %rdx is the qword index/counter.
60        movq     %rdx,%r8             # byte count
61        shrq     $3,%rdx              # qword count
62        cmpq     %rdi,%rsi
        # leaq leaves the flags alone, so the jbe below still tests the
        # cmpq of 'to' against 'from' above.
63        leaq     -1(%rdi,%r8,1),%rax  # from + bcount*1 - 1
64        jbe      acb_CopyRight
65        cmpq     %rax,%rsi
66        jbe      acb_CopyLeft
        # Ascending copy.  %rax/%rcx address the last full qword of the
        # source/destination and %rdx = -qcount counts up towards zero.
        # %rsi is reused as the data temporary from here on.
67acb_CopyRight:
68        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
69        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
70        negq     %rdx
71        jmp      7f
72        .p2align 4,,15
        # 1: copies the remaining 1-3 qwords one at a time until %rdx is 0.
731:      movq     8(%rax,%rdx,8),%rsi
74        movq     %rsi,8(%rcx,%rdx,8)
75        addq     $1,%rdx
76        jnz      1b
        # 2: tail handling.  8(%rax) is the first source byte after the
        # qwords; %rax/%rcx are advanced past each tail piece that is copied.
772:      testq    $4,%r8               # check for trailing dword
78        jz       3f
79        movl     8(%rax),%esi         # copy trailing dword
80        movl     %esi,8(%rcx)
81        addq     $4,%rax
82        addq     $4,%rcx              # original %rsi is trashed, so we
83                                      #  can't use it as a base register
843:      testq    $2,%r8               # check for trailing word
85        jz       4f
86        movw     8(%rax),%si          # copy trailing word
87        movw     %si,8(%rcx)
88        addq     $2,%rcx
        # The last byte is loaded via the untouched %rdi and %r8
        # (from + count - 1); the load overwrites %al, which is no longer
        # needed as a base.
894:      testq    $1,%r8               # check for trailing byte
90        jz       5f
91        movb     -1(%rdi,%r8,1),%al   # copy trailing byte
92        movb     %al,8(%rcx)
935:      ret
94        .p2align 4,,15
        # 6: unrolled body, four qwords per iteration in ascending order.
956:      movq     -24(%rax,%rdx,8),%rsi
96        movq     %rsi,-24(%rcx,%rdx,8)
97        movq     -16(%rax,%rdx,8),%rsi
98        movq     %rsi,-16(%rcx,%rdx,8)
99        movq     -8(%rax,%rdx,8),%rsi
100        movq     %rsi,-8(%rcx,%rdx,8)
101        movq     (%rax,%rdx,8),%rsi
102        movq     %rsi,(%rcx,%rdx,8)
        # 7: loop entry/test.  If %rdx + 4 is still <= 0 at least four
        # qwords remain; otherwise undo the add and either finish the
        # leftover qwords at 1: or go straight to the tail at 2:.
1037:      addq     $4,%rdx
104        jle      6b
105        subq     $4,%rdx
106        jl       1b
107        jmp      2b
        # Descending copy.  The tail pieces at the high end are copied first
        # (byte, then word, then dword), then the qwords from high to low
        # addresses.  %rdi/%rsi stay intact; %rcx is the data temporary.
108acb_CopyLeft:
109        testq    $1,%r8               # check for trailing byte
110        jz       1f
111        movb     -1(%rdi,%r8,1),%cl   # copy trailing byte
112        movb     %cl,-1(%rsi,%r8,1)
113        subq     $1,%r8               # adjust for possible trailing word
1141:      testq    $2,%r8               # check for trailing word
115        jz       2f
116        movw     -2(%rdi,%r8,1),%cx   # copy trailing word
117        movw     %cx,-2(%rsi,%r8,1)
        # The dword, if present, sits directly after the qwords, i.e. at
        # base + qcount*8.
1182:      testq    $4,%r8               # check for trailing dword
119        jz       5f
120        movl     (%rdi,%rdx,8),%ecx   # copy trailing dword
121        movl     %ecx,(%rsi,%rdx,8)
122        jmp      5f
123        .p2align 4,,15
        # 3: copies the remaining 1-3 qwords one at a time, high to low,
        # until %rdx reaches 0.
1243:      movq     -8(%rdi,%rdx,8),%rcx
125        movq     %rcx,-8(%rsi,%rdx,8)
126        subq     $1,%rdx
127        jnz      3b
128        ret
129        .p2align 4,,15
        # 4: unrolled body, four qwords per iteration in descending order.
1304:      movq     24(%rdi,%rdx,8),%rcx
131        movq     %rcx,24(%rsi,%rdx,8)
132        movq     16(%rdi,%rdx,8),%rcx
133        movq     %rcx,16(%rsi,%rdx,8)
134        movq     8(%rdi,%rdx,8),%rcx
135        movq     %rcx,8(%rsi,%rdx,8)
136        movq     (%rdi,%rdx,8),%rcx
137        movq     %rcx,(%rsi,%rdx,8)
        # 5: loop entry/test.  While %rdx - 4 >= 0 run the unrolled body;
        # otherwise restore %rdx and copy any leftover qwords at 3:.
1385:      subq     $4,%rdx
139        jge      4b
140        addq     $4,%rdx
141        jg       3b
142        ret
143
144        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
145        #                                                 void* to,
146        #                                                 size_t count)
147        # Equivalent to
148        #   conjoint_jshorts_atomic
149        #
150        # If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
151        # let the hardware handle it.  The two or four words within dwords
152        # or qwords that span cache line boundaries will still be loaded
153        # and stored atomically.
154        #
155        # rdi - from
156        # rsi - to
157        # rdx - count, treated as ssize_t
158        #
159        .p2align 4,,15
160	.type    _Copy_arrayof_conjoint_jshorts,@function
161	.type    _Copy_conjoint_jshorts_atomic,@function
        # Both symbols name the same entry point.
        # The data is moved as (count >> 2) qwords followed by an optional
        # dword and word tail selected by bits 1 and 0 of the 16-bit element
        # count.  The copy direction is picked from the two pointers using
        # unsigned comparisons:
        #   to <= from                        -> acs_CopyRight (ascending)
        #   from < to <= from + count*2 - 2   -> acs_CopyLeft  (descending)
        #   to > from + count*2 - 2           -> acs_CopyRight (ascending)
        # %r8 keeps the element count; %rdx is the qword index/counter.
162_Copy_arrayof_conjoint_jshorts:
163_Copy_conjoint_jshorts_atomic:
164        movq     %rdx,%r8             # word count
165        shrq     $2,%rdx              # qword count
166        cmpq     %rdi,%rsi
        # leaq leaves the flags alone, so the jbe below still tests the
        # cmpq of 'to' against 'from' above.
167        leaq     -2(%rdi,%r8,2),%rax  # from + wcount*2 - 2
168        jbe      acs_CopyRight
169        cmpq     %rax,%rsi
170        jbe      acs_CopyLeft
        # Ascending copy.  %rax/%rcx address the last full qword of the
        # source/destination and %rdx = -qcount counts up towards zero.
        # %rsi is reused as the data temporary from here on.
171acs_CopyRight:
172        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
173        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
174        negq     %rdx
175        jmp      6f
        # 1: copies the remaining 1-3 qwords one at a time until %rdx is 0.
1761:      movq     8(%rax,%rdx,8),%rsi
177        movq     %rsi,8(%rcx,%rdx,8)
178        addq     $1,%rdx
179        jnz      1b
        # 2: tail handling.  8(%rax) is the first source byte after the
        # qwords.  Only %rcx is advanced past the dword, because the final
        # word is loaded via the untouched %rdi and %r8.
1802:      testq    $2,%r8               # check for trailing dword
181        jz       3f
182        movl     8(%rax),%esi         # copy trailing dword
183        movl     %esi,8(%rcx)
184        addq     $4,%rcx              # original %rsi is trashed, so we
185                                      #  can't use it as a base register
1863:      testq    $1,%r8               # check for trailing word
187        jz       4f
188        movw     -2(%rdi,%r8,2),%si   # copy trailing word
189        movw     %si,8(%rcx)
1904:      ret
191        .p2align 4,,15
        # 5: unrolled body, four qwords per iteration in ascending order.
1925:      movq     -24(%rax,%rdx,8),%rsi
193        movq     %rsi,-24(%rcx,%rdx,8)
194        movq     -16(%rax,%rdx,8),%rsi
195        movq     %rsi,-16(%rcx,%rdx,8)
196        movq     -8(%rax,%rdx,8),%rsi
197        movq     %rsi,-8(%rcx,%rdx,8)
198        movq     (%rax,%rdx,8),%rsi
199        movq     %rsi,(%rcx,%rdx,8)
        # 6: loop entry/test.  If %rdx + 4 is still <= 0 at least four
        # qwords remain; otherwise undo the add and either finish the
        # leftover qwords at 1: or go straight to the tail at 2:.
2006:      addq     $4,%rdx
201        jle      5b
202        subq     $4,%rdx
203        jl       1b
204        jmp      2b
        # Descending copy.  The tail pieces at the high end are copied first
        # (word, then dword), then the qwords from high to low addresses.
        # %rdi/%rsi stay intact; %rcx is the data temporary.
205acs_CopyLeft:
206        testq    $1,%r8               # check for trailing word
207        jz       1f
208        movw     -2(%rdi,%r8,2),%cx   # copy trailing word
209        movw     %cx,-2(%rsi,%r8,2)
        # The dword, if present, sits directly after the qwords, i.e. at
        # base + qcount*8.
2101:      testq    $2,%r8               # check for trailing dword
211        jz       4f
212        movl     (%rdi,%rdx,8),%ecx   # copy trailing dword
213        movl     %ecx,(%rsi,%rdx,8)
214        jmp      4f
        # 2: copies the remaining 1-3 qwords one at a time, high to low,
        # until %rdx reaches 0.
2152:      movq     -8(%rdi,%rdx,8),%rcx
216        movq     %rcx,-8(%rsi,%rdx,8)
217        subq     $1,%rdx
218        jnz      2b
219        ret
220        .p2align 4,,15
        # 3: unrolled body, four qwords per iteration in descending order.
2213:      movq     24(%rdi,%rdx,8),%rcx
222        movq     %rcx,24(%rsi,%rdx,8)
223        movq     16(%rdi,%rdx,8),%rcx
224        movq     %rcx,16(%rsi,%rdx,8)
225        movq     8(%rdi,%rdx,8),%rcx
226        movq     %rcx,8(%rsi,%rdx,8)
227        movq     (%rdi,%rdx,8),%rcx
228        movq     %rcx,(%rsi,%rdx,8)
        # 4: loop entry/test.  While %rdx - 4 >= 0 run the unrolled body;
        # otherwise restore %rdx and copy any leftover qwords at 2:.
2294:      subq     $4,%rdx
230        jge      3b
231        addq     $4,%rdx
232        jg       2b
233        ret
234
235        # Support for void Copy::arrayof_conjoint_jints(jint* from,
236        #                                               jint* to,
237        #                                               size_t count)
238        # Equivalent to
239        #   conjoint_jints_atomic
240        #
241        # If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
242        # the hardware handle it.  The two dwords within qwords that span
243        # cache line boundaries will still be loaded and stored atomically.
244        #
245        # rdi - from
246        # rsi - to
247        # rdx - count, treated as ssize_t
248        #
249        .p2align 4,,15
250	.type    _Copy_arrayof_conjoint_jints,@function
251	.type    _Copy_conjoint_jints_atomic,@function
        # Both symbols name the same entry point.
        # The data is moved as (count >> 1) qwords followed by one optional
        # trailing dword when bit 0 of the dword count is set.  The copy
        # direction is picked from the two pointers using unsigned
        # comparisons:
        #   to <= from                        -> aci_CopyRight (ascending)
        #   from < to <= from + count*4 - 4   -> aci_CopyLeft  (descending)
        #   to > from + count*4 - 4           -> aci_CopyRight (ascending)
        # %r8 keeps the dword count; %rdx is the qword index/counter.
252_Copy_arrayof_conjoint_jints:
253_Copy_conjoint_jints_atomic:
254        movq     %rdx,%r8             # dword count
        # shrq with a single operand shifts right by one bit.
255        shrq     %rdx                 # qword count
256        cmpq     %rdi,%rsi
        # leaq leaves the flags alone, so the jbe below still tests the
        # cmpq of 'to' against 'from' above.
257        leaq     -4(%rdi,%r8,4),%rax  # from + dcount*4 - 4
258        jbe      aci_CopyRight
259        cmpq     %rax,%rsi
260        jbe      aci_CopyLeft
        # Ascending copy.  %rax/%rcx address the last full qword of the
        # source/destination and %rdx = -qcount counts up towards zero.
        # %rsi is reused as the data temporary from here on.
261aci_CopyRight:
262        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
263        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
264        negq     %rdx
265        jmp      5f
266        .p2align 4,,15
        # 1: copies the remaining 1-3 qwords one at a time until %rdx is 0.
2671:      movq     8(%rax,%rdx,8),%rsi
268        movq     %rsi,8(%rcx,%rdx,8)
269        addq     $1,%rdx
270        jnz       1b
        # 2: tail handling.  8(%rax)/8(%rcx) are the first bytes after the
        # qwords in the source/destination.
2712:      testq    $1,%r8               # check for trailing dword
272        jz       3f
273        movl     8(%rax),%esi         # copy trailing dword
274        movl     %esi,8(%rcx)
2753:      ret
276        .p2align 4,,15
        # 4: unrolled body, four qwords per iteration in ascending order.
2774:      movq     -24(%rax,%rdx,8),%rsi
278        movq     %rsi,-24(%rcx,%rdx,8)
279        movq     -16(%rax,%rdx,8),%rsi
280        movq     %rsi,-16(%rcx,%rdx,8)
281        movq     -8(%rax,%rdx,8),%rsi
282        movq     %rsi,-8(%rcx,%rdx,8)
283        movq     (%rax,%rdx,8),%rsi
284        movq     %rsi,(%rcx,%rdx,8)
        # 5: loop entry/test.  If %rdx + 4 is still <= 0 at least four
        # qwords remain; otherwise undo the add and either finish the
        # leftover qwords at 1: or go straight to the tail at 2:.
2855:      addq     $4,%rdx
286        jle      4b
287        subq     $4,%rdx
288        jl       1b
289        jmp      2b
        # Descending copy.  The trailing dword (the last element) is copied
        # first, then the qwords from high to low addresses.  %rdi/%rsi stay
        # intact; %rcx is the data temporary.
290aci_CopyLeft:
291        testq    $1,%r8               # check for trailing dword
292        jz       3f
293        movl     -4(%rdi,%r8,4),%ecx  # copy trailing dword
294        movl     %ecx,-4(%rsi,%r8,4)
295        jmp      3f
        # 1: copies the remaining 1-3 qwords one at a time, high to low,
        # until %rdx reaches 0.
2961:      movq     -8(%rdi,%rdx,8),%rcx
297        movq     %rcx,-8(%rsi,%rdx,8)
298        subq     $1,%rdx
299        jnz      1b
300        ret
301        .p2align 4,,15
        # 2: unrolled body, four qwords per iteration in descending order.
3022:      movq     24(%rdi,%rdx,8),%rcx
303        movq     %rcx,24(%rsi,%rdx,8)
304        movq     16(%rdi,%rdx,8),%rcx
305        movq     %rcx,16(%rsi,%rdx,8)
306        movq     8(%rdi,%rdx,8),%rcx
307        movq     %rcx,8(%rsi,%rdx,8)
308        movq     (%rdi,%rdx,8),%rcx
309        movq     %rcx,(%rsi,%rdx,8)
        # 3: loop entry/test.  While %rdx - 4 >= 0 run the unrolled body;
        # otherwise restore %rdx and copy any leftover qwords at 1:.
3103:      subq     $4,%rdx
311        jge      2b
312        addq     $4,%rdx
313        jg       1b
314        ret
315
316        # Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
317        #                                                jlong* to,
318        #                                                size_t count)
319        # Equivalent to
320        #   conjoint_jlongs_atomic
321        #   arrayof_conjoint_oops
322        #   conjoint_oops_atomic
323        #
324        # rdi - from
325        # rsi - to
326        # rdx - count, treated as ssize_t
327        #
328        .p2align 4,,15
329	.type    _Copy_arrayof_conjoint_jlongs,@function
330	.type    _Copy_conjoint_jlongs_atomic,@function
        # Both symbols name the same entry point.
        # The element count in %rdx is already the qword count, so there is
        # no tail handling.  The copy direction is picked from the two
        # pointers using unsigned comparisons:
        #   to <= from                        -> acl_CopyRight (ascending)
        #   from < to <= from + count*8 - 8   -> acl_CopyLeft  (descending)
        #   to > from + count*8 - 8           -> acl_CopyRight (ascending)
331_Copy_arrayof_conjoint_jlongs:
332_Copy_conjoint_jlongs_atomic:
333        cmpq     %rdi,%rsi
        # leaq leaves the flags alone, so the jbe below still tests the
        # cmpq above.  %rax also serves as the source base in acl_CopyRight.
334        leaq     -8(%rdi,%rdx,8),%rax # from + count*8 - 8
335        jbe      acl_CopyRight
336        cmpq     %rax,%rsi
337        jbe      acl_CopyLeft
        # Ascending copy.  %rax/%rcx address the last qword of the
        # source/destination and %rdx = -count counts up towards zero.
        # %rsi is reused as the data temporary from here on.
338acl_CopyRight:
339        leaq     -8(%rsi,%rdx,8),%rcx # to + count*8 - 8
340        negq     %rdx
341        jmp      3f
        # 1: copies the remaining 1-3 qwords one at a time until %rdx is 0.
3421:      movq     8(%rax,%rdx,8),%rsi
343        movq     %rsi,8(%rcx,%rdx,8)
344        addq     $1,%rdx
345        jnz      1b
346        ret
347        .p2align 4,,15
        # 2: unrolled body, four qwords per iteration in ascending order.
3482:      movq     -24(%rax,%rdx,8),%rsi
349        movq     %rsi,-24(%rcx,%rdx,8)
350        movq     -16(%rax,%rdx,8),%rsi
351        movq     %rsi,-16(%rcx,%rdx,8)
352        movq     -8(%rax,%rdx,8),%rsi
353        movq     %rsi,-8(%rcx,%rdx,8)
354        movq     (%rax,%rdx,8),%rsi
355        movq     %rsi,(%rcx,%rdx,8)
        # 3: loop entry/test.  If %rdx + 4 is still <= 0 at least four
        # qwords remain; otherwise undo the add and finish any leftover
        # qwords at 1:.
3563:      addq     $4,%rdx
357        jle      2b
358        subq     $4,%rdx
359        jl       1b
360        ret
        # 4: descending single-qword loop for the 1-3 leftover qwords;
        # runs until %rdx reaches 0.  %rcx is the data temporary.
3614:      movq     -8(%rdi,%rdx,8),%rcx
362        movq     %rcx,-8(%rsi,%rdx,8)
363        subq     $1,%rdx
364        jnz      4b
365        ret
366        .p2align 4,,15
        # 5: unrolled body, four qwords per iteration in descending order.
        # It falls through into the loop test at acl_CopyLeft.
3675:      movq     24(%rdi,%rdx,8),%rcx
368        movq     %rcx,24(%rsi,%rdx,8)
369        movq     16(%rdi,%rdx,8),%rcx
370        movq     %rcx,16(%rsi,%rdx,8)
371        movq     8(%rdi,%rdx,8),%rcx
372        movq     %rcx,8(%rsi,%rdx,8)
373        movq     (%rdi,%rdx,8),%rcx
374        movq     %rcx,(%rsi,%rdx,8)
        # Descending copy entry point, which is also the loop test: while
        # %rdx - 4 >= 0 run the unrolled body at 5:; otherwise restore %rdx
        # and copy any leftover qwords at 4:.
375acl_CopyLeft:
376        subq     $4,%rdx
377        jge      5b
378        addq     $4,%rdx
379        jg       4b
380        ret
381