/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * __folly_memcpy: An optimized memcpy implementation that uses prefetch and
 * AVX2 instructions.
 *
 * This implementation of memcpy acts as a memmove: while overlapping copies
 * are undefined in memcpy, in some implementations they're the same function
 * and legacy programs rely on this behavior.
 *
 * This implementation uses prefetch to avoid dtlb misses. This can
 * substantially reduce dtlb store misses in cases where the destination
 * location is absent from L1 cache and where the copy size is small enough
 * that the hardware prefetcher doesn't have a large impact.
 *
 * The number of branches is limited by the use of overlapping loads & stores.
 * This helps with copies where the source and destination cache lines are
 * already present in L1 because there are fewer instructions to execute and
 * fewer branches to potentially mispredict.
 *   e.g. to copy 4 <= n <= 7 bytes: copy the first & last 4 bytes, letting
 *   the two accesses overlap (a C sketch of this pattern follows this
 *   comment):
 *      movl        (%rsi), %r8d
 *      movl        -4(%rsi,%rdx), %r9d
 *      movl        %r8d, (%rdi)
 *      movl        %r9d, -4(%rdi,%rdx)
 *
 * For sizes up to 256 all source data is first read into registers and then
 * written:
 * - n <=  16: overlapping movs
 * - n <=  32: overlapping unaligned 16-byte SSE XMM load/stores
 * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
 *
 * Large copies (> 256 bytes) use unaligned loads + aligned stores.
 * This is observed to always be faster than rep movsb, so the rep movsb
 * instruction is not used.
 * - The head & tail may be unaligned => they're always written using
 *   unaligned stores.
 *
 * If the copy size is humongous (> 32 KiB) and the source and destination are
 * both aligned, this memcpy will use non-temporal operations (AVX2). This can
 * have a substantial speedup for copies where data is absent from L1, but it
 * is significantly slower if the source and destination data were already
 * in L1. The use of non-temporal operations also has the effect that after
 * the copy is complete, the data will be moved out of L1, even if the data
 * was present before the copy started.
 *
 * For n > 256 and overlapping src & dst buffers (memmove):
 * - use unaligned loads + aligned stores, but not non-temporal stores
 * - for dst < src forward copy in 128 byte batches:
 *   - unaligned load the first 32 bytes & last 4 x 32 bytes
 *   - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 32 bytes & last 4 x 32 bytes
 * - for dst > src backward copy in 128 byte batches:
 *   - unaligned load the first 4 x 32 bytes & last 32 bytes
 *   - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 4 x 32 bytes & last 32 bytes
 *
 * @author Logan Evans <lpe@fb.com>
 */
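
// The size classes above all use the same "overlapping first & last chunk"
// trick. As an illustration only (this sketch is not part of the build and
// the helper name is ours), the 4 <= n <= 7 case from the comment above
// could be written in C as:
//
//   #include <stdint.h>
//   #include <string.h>
//
//   // A fixed-size memcpy() stands in for one unaligned 4-byte load/store.
//   static void copy_4_to_7(void *dst, const void *src, size_t n) {
//       uint32_t head, tail;
//       memcpy(&head, src, 4);                        // first 4 bytes
//       memcpy(&tail, (const char *)src + n - 4, 4);  // last 4 bytes
//       memcpy(dst, &head, 4);
//       memcpy((char *)dst + n - 4, &tail, 4);        // overlaps when n < 8
//   }
//
// The 8/16-byte, XMM and YMM paths below scale the same idea up to wider
// registers.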

#if defined(__AVX2__)

// This threshold is half of L1 cache on a Skylake machine, which means that
// potentially all of L1 will be populated by this copy once it is executed
// (dst and src are cached for temporal copies).
#define NON_TEMPORAL_STORE_THRESHOLD $32768

        .file       "memcpy.S"
        .section    .text,"ax"

        .type       __folly_memcpy_short, @function
__folly_memcpy_short:
        .cfi_startproc

.L_GE1_LE7:
        cmp         $1, %rdx
        je          .L_EQ1

        cmp         $4, %rdx
        jae         .L_GE4_LE7

.L_GE2_LE3:
        movw        (%rsi), %r8w
        movw        -2(%rsi,%rdx), %r9w
        movw        %r8w, (%rdi)
        movw        %r9w, -2(%rdi,%rdx)
        ret

        .align      2
.L_EQ1:
        movb        (%rsi), %r8b
        movb        %r8b, (%rdi)
        ret

        // Aligning the target of a jump to an even address has a measurable
        // speedup in microbenchmarks.
        .align      2
.L_GE4_LE7:
        movl        (%rsi), %r8d
        movl        -4(%rsi,%rdx), %r9d
        movl        %r8d, (%rdi)
        movl        %r9d, -4(%rdi,%rdx)
        ret

        .cfi_endproc
        .size       __folly_memcpy_short, .-__folly_memcpy_short

// memcpy is an alternative entrypoint into the function named __folly_memcpy.
// The compiler is able to call memcpy since the name is global while
// stacktraces will show __folly_memcpy since that is the name of the function.
// This is intended to aid in debugging by making it obvious which version of
// memcpy is being used.
        .align      64
        .globl      __folly_memcpy
        .type       __folly_memcpy, @function

__folly_memcpy:
        .cfi_startproc

        mov         %rdi, %rax    # return: $rdi

        test        %rdx, %rdx
        je          .L_EQ0

        prefetchw   (%rdi)
        prefetchw   -1(%rdi,%rdx)

        cmp         $8, %rdx
        jb          .L_GE1_LE7

.L_GE8:
        cmp         $32, %rdx
        ja          .L_GE33

.L_GE8_LE32:
        cmp         $16, %rdx
        ja          .L_GE17_LE32

.L_GE8_LE16:
        mov         (%rsi), %r8
        mov         -8(%rsi,%rdx), %r9
        mov         %r8, (%rdi)
        mov         %r9, -8(%rdi,%rdx)
.L_EQ0:
        ret

        .align      2
.L_GE17_LE32:
        movdqu      (%rsi), %xmm0
        movdqu      -16(%rsi,%rdx), %xmm1
        movdqu      %xmm0, (%rdi)
        movdqu      %xmm1, -16(%rdi,%rdx)
        ret

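// For n > 32 the loads at .L_GE33 leave the first (up to) 128 bytes of the
// source in %ymm0-%ymm3 and the last (up to) 128 bytes in %ymm4-%ymm7
// (%ymm4 = bytes [n-128, n-96), ..., %ymm7 = bytes [n-32, n)). Each entry
// point in the store cascade below writes only the registers that are live
// for its size class.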
        .align      2
.L_GE193_LE256:
        vmovdqu     %ymm3, 96(%rdi)
        vmovdqu     %ymm4, -128(%rdi,%rdx)

.L_GE129_LE192:
        vmovdqu     %ymm2, 64(%rdi)
        vmovdqu     %ymm5, -96(%rdi,%rdx)

.L_GE65_LE128:
        vmovdqu     %ymm1, 32(%rdi)
        vmovdqu     %ymm6, -64(%rdi,%rdx)

.L_GE33_LE64:
        vmovdqu     %ymm0, (%rdi)
        vmovdqu     %ymm7, -32(%rdi,%rdx)

        vzeroupper
        ret

        .align      2
.L_GE33:
        vmovdqu     (%rsi), %ymm0
        vmovdqu     -32(%rsi,%rdx), %ymm7

        cmp         $64, %rdx
        jbe         .L_GE33_LE64

        prefetchw   64(%rdi)

        vmovdqu     32(%rsi), %ymm1
        vmovdqu     -64(%rsi,%rdx), %ymm6

        cmp         $128, %rdx
        jbe         .L_GE65_LE128

        prefetchw   128(%rdi)

        vmovdqu     64(%rsi), %ymm2
        vmovdqu     -96(%rsi,%rdx), %ymm5

        cmp         $192, %rdx
        jbe         .L_GE129_LE192

        prefetchw   192(%rdi)

        vmovdqu     96(%rsi), %ymm3
        vmovdqu     -128(%rsi,%rdx), %ymm4

        cmp         $256, %rdx
        jbe         .L_GE193_LE256

.L_GE257:
        prefetchw   256(%rdi)

        // Check if there is an overlap. If there is an overlap then the caller
        // has a bug since this is undefined behavior. However, for legacy
        // reasons this behavior is expected by some callers.
        //
        // All copies through 256 bytes will operate as a memmove since for
        // those sizes all reads are performed before any writes.
        //
        // This check uses the idea that there is an overlap if
        // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)),
        // or equivalently, there is no overlap if
        // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi).
        //
        // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
        // bytes remain to be copied.

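        // The same test, restated as a small illustrative C helper (the name
        // and signature below are ours, not part of this file):
        //
        //   #include <stdint.h>   // uintptr_t
        //   #include <stddef.h>   // size_t
        //
        //   static int no_overlap(uintptr_t dst, uintptr_t src, size_t n) {
        //       return (src + n <= dst) || (dst + n <= src);
        //   }
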
        // (%rsi + %rdx <= %rdi) => no overlap
        lea         (%rsi,%rdx), %r9
        cmp         %rdi, %r9
        jbe         .L_NO_OVERLAP

        // (%rdi + %rdx <= %rsi) => no overlap
        lea         (%rdi,%rdx), %r8
        cmp         %rsi, %r8
        // If no information is available in the branch predictor's cache,
        // Intel CPUs assume forward jumps are not taken. Use a forward jump
        // as overlapping buffers are unlikely.
        ja          .L_OVERLAP

        .align      2
.L_NO_OVERLAP:
        vmovdqu     %ymm0, (%rdi)
        vmovdqu     %ymm1, 32(%rdi)
        vmovdqu     %ymm2, 64(%rdi)
        vmovdqu     %ymm3, 96(%rdi)

        // Align %rdi to a 32 byte boundary.
        // %rcx = 128 - (%rdi & 31)
        mov         $128, %rcx
        and         $31, %rdi
        sub         %rdi, %rcx
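        // e.g. if %rdi & 31 == 5, %rcx = 123 and %rdi + %rcx is 32-byte
        // aligned. %rcx is always <= 128, so the bytes skipped here were all
        // covered by the four unaligned stores above.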

        lea         (%rsi,%rcx), %rsi
        lea         (%rax,%rcx), %rdi
        sub         %rcx, %rdx

        // %r8 is the end condition for the loop.
        lea         -128(%rsi,%rdx), %r8

        cmp         NON_TEMPORAL_STORE_THRESHOLD, %rdx
        jae         .L_NON_TEMPORAL_LOOP

        .align      2
.L_ALIGNED_DST_LOOP:
        prefetchw   128(%rdi)
        prefetchw   192(%rdi)

        vmovdqu     (%rsi), %ymm0
        vmovdqu     32(%rsi), %ymm1
        vmovdqu     64(%rsi), %ymm2
        vmovdqu     96(%rsi), %ymm3
        add         $128, %rsi

        vmovdqa     %ymm0, (%rdi)
        vmovdqa     %ymm1, 32(%rdi)
        vmovdqa     %ymm2, 64(%rdi)
        vmovdqa     %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_ALIGNED_DST_LOOP

.L_ALIGNED_DST_LOOP_END:
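        // %r9 still holds the original source end (%rsi + %rdx from before the
        // loop), so %r9 - %rsi is the number of tail bytes left (at most 128);
        // %ymm4-%ymm7 hold exactly those last 128 source bytes.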
        sub         %rsi, %r9
        mov         %r9, %rdx

        vmovdqu     %ymm4, -128(%rdi,%rdx)
        vmovdqu     %ymm5, -96(%rdi,%rdx)
        vmovdqu     %ymm6, -64(%rdi,%rdx)
        vmovdqu     %ymm7, -32(%rdi,%rdx)

        vzeroupper
        ret

        .align      2
.L_NON_TEMPORAL_LOOP:
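        // vmovntdqa requires a 32-byte-aligned source operand; if %rsi is not
        // aligned, fall back to the regular (temporal) loop above.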
        testb       $31, %sil
        jne         .L_ALIGNED_DST_LOOP
        // Unlike .L_ALIGNED_DST_LOOP, which prefetches the destination, this
        // loop prefetches the source. The choice is again informed by
        // benchmarks: with non-temporal stores the entirety of each
        // destination cache line is written, so its previous contents can be
        // discarded without being fetched.
        prefetchnta 128(%rsi)
        prefetchnta 196(%rsi)

        vmovntdqa   (%rsi), %ymm0
        vmovntdqa   32(%rsi), %ymm1
        vmovntdqa   64(%rsi), %ymm2
        vmovntdqa   96(%rsi), %ymm3
        add         $128, %rsi

        vmovntdq    %ymm0, (%rdi)
        vmovntdq    %ymm1, 32(%rdi)
        vmovntdq    %ymm2, 64(%rdi)
        vmovntdq    %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_NON_TEMPORAL_LOOP

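        // Non-temporal stores are weakly ordered; the sfence makes them
        // globally visible before the (ordinary) tail stores and the return.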
        sfence
        jmp         .L_ALIGNED_DST_LOOP_END

        .align      2
.L_OVERLAP:
        cmp         %rdi, %rsi
        jb          .L_OVERLAP_BWD  // %rsi  < %rdi => backward-copy
        je          .L_RET          // %rsi == %rdi => return, nothing to copy

        // Source & destination buffers overlap. Forward copy.

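        // Save the first 32 source bytes now: dst < src here and the buffers
        // overlap, so the aligned loop below may clobber them before the head
        // of the destination is written from %ymm8 after the loop.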
        vmovdqu     (%rsi), %ymm8

        // Align %rdi to a 32 byte boundary.
        // %rcx = 32 - (%rdi & 31)
        mov         $32, %rcx
        and         $31, %rdi
        sub         %rdi, %rcx

        lea         (%rsi,%rcx), %rsi
        lea         (%rax,%rcx), %rdi
        sub         %rcx, %rdx

        // %r8 is the end condition for the loop.
        lea         -128(%rsi,%rdx), %r8

.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
        prefetchw   128(%rdi)
        prefetchw   192(%rdi)

        vmovdqu       (%rsi), %ymm0
        vmovdqu     32(%rsi), %ymm1
        vmovdqu     64(%rsi), %ymm2
        vmovdqu     96(%rsi), %ymm3
        add         $128, %rsi

        vmovdqa     %ymm0,   (%rdi)
        vmovdqa     %ymm1, 32(%rdi)
        vmovdqa     %ymm2, 64(%rdi)
        vmovdqa     %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_OVERLAP_FWD_ALIGNED_DST_LOOP

        sub         %rsi, %r9
        mov         %r9, %rdx

        vmovdqu     %ymm4, -128(%rdi,%rdx)
        vmovdqu     %ymm5,  -96(%rdi,%rdx)
        vmovdqu     %ymm6,  -64(%rdi,%rdx)
        vmovdqu     %ymm7,  -32(%rdi,%rdx)
        vmovdqu     %ymm8, (%rax)  // %rax == the original (unaligned) %rdi

        vzeroupper

.L_RET:
        ret

.L_OVERLAP_BWD:
        # Save last 32 bytes.
        vmovdqu     -32(%rsi, %rdx), %ymm8
        lea         -32(%rdi, %rdx), %r9

        // %r8 is the end condition for the loop.
        lea         128(%rsi), %r8

        // Align %rdi+%rdx (destination end) to a 32 byte boundary.
        // %rcx = (%rdi + %rdx - 32) & 31
        mov         %r9, %rcx
        and         $31, %rcx
        // Set %rsi & %rdi to the end of the 32 byte aligned range.
        sub         %rcx, %rdx
        add         %rdx, %rsi
        add         %rdx, %rdi
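
        // After these adjustments %rdi points %rcx bytes before the original
        // destination end and is 32-byte aligned, so the backward loop's
        // stores are aligned. The trailing %rcx bytes (%rcx <= 31) are covered
        // by the %ymm8 store to (%r9) below.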

.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
        prefetchw   -128(%rdi)
        prefetchw   -192(%rdi)

        vmovdqu      -32(%rsi), %ymm4
        vmovdqu      -64(%rsi), %ymm5
        vmovdqu      -96(%rsi), %ymm6
        vmovdqu     -128(%rsi), %ymm7
        sub         $128, %rsi

        vmovdqa     %ymm4,  -32(%rdi)
        vmovdqa     %ymm5,  -64(%rdi)
        vmovdqa     %ymm6,  -96(%rdi)
        vmovdqa     %ymm7, -128(%rdi)
        sub         $128, %rdi

        cmp         %r8, %rsi
        ja          .L_OVERLAP_BWD_ALIGNED_DST_LOOP

        vmovdqu     %ymm0,   (%rax)  // %rax == the original unaligned %rdi
        vmovdqu     %ymm1, 32(%rax)
        vmovdqu     %ymm2, 64(%rax)
        vmovdqu     %ymm3, 96(%rax)
        vmovdqu     %ymm8, (%r9)

        vzeroupper
        ret

        .cfi_endproc
        .size       __folly_memcpy, .-__folly_memcpy

#ifdef FOLLY_MEMCPY_IS_MEMCPY
        .weak       memcpy
        memcpy = __folly_memcpy

        .weak       memmove
        memmove = __folly_memcpy
#endif

        .ident "GCC: (GNU) 4.8.2"
#ifdef __linux__
        .section .note.GNU-stack,"",@progbits
#endif

#endif