/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * __folly_memcpy: An optimized memcpy implementation that uses prefetch and
 * AVX2 instructions.
 *
 * This implementation of memcpy acts as a memmove: while overlapping copies
 * are undefined in memcpy, in some implementations memcpy and memmove are the
 * same function and legacy programs rely on this behavior.
 *
 * This implementation uses prefetch to avoid dtlb misses. This can
 * substantially reduce dtlb store misses in cases where the destination
 * location is absent from L1 cache and where the copy size is small enough
 * that the hardware prefetcher doesn't have a large impact.
 *
 * The number of branches is limited by the use of overlapping loads & stores.
 * This helps with copies where the source and destination cache lines are
 * already present in L1 because there are fewer instructions to execute and
 * fewer branches to potentially mispredict.
 *   e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes
 *   (overlapped):
 *     movl        (%rsi), %r8d
 *     movl        -4(%rsi,%rdx), %r9d
 *     movl        %r8d, (%rdi)
 *     movl        %r9d, -4(%rdi,%rdx)
 *
 * For sizes up to 256 all source data is first read into registers and then
 * written:
 * - n <= 16: overlapping movs
 * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
 * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
 *
 * Large copies (> 256 bytes) use unaligned loads + aligned stores.
 * This is observed to always be faster than rep movsb, so the rep movsb
 * instruction is not used.
 * - The head & tail may be unaligned => they're always written using
 *   unaligned stores.
 *
 * If the copy size is humongous (> 32 KiB) and the source and destination are
 * both aligned, this memcpy will use non-temporal operations (AVX2). This can
 * have a substantial speedup for copies where data is absent from L1, but it
 * is significantly slower if the source and destination data were already in
 * L1. The use of non-temporal operations also has the effect that after the
 * copy is complete, the data will be moved out of L1, even if the data was
 * present before the copy started.
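 *
 * In outline, the choice made for large copies looks like the following C
 * sketch. It is purely illustrative and not part of the build; the helper
 * names are hypothetical, and 32768 is NON_TEMPORAL_STORE_THRESHOLD. At this
 * point dst has already been bumped to a 32-byte boundary:
 *
 *   if (n >= 32768 && ((uintptr_t)src & 31) == 0) {
 *     non_temporal_copy(dst, src, n);  // vmovntdqa loads + vmovntdq stores
 *   } else {
 *     temporal_copy(dst, src, n);      // vmovdqu loads + vmovdqa stores
 *   }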
 *
 * For n > 256 and overlapping src & dst buffers (memmove):
 * - use unaligned loads + aligned stores, but not non-temporal stores
 * - for dst < src forward copy in 128 byte batches:
 *   - unaligned load the first 32 bytes & last 4 x 32 bytes
 *   - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 32 bytes & last 4 x 32 bytes
 * - for dst > src backward copy in 128 byte batches:
 *   - unaligned load the first 4 x 32 bytes & last 32 bytes
 *   - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
 *   - unaligned store the first 4 x 32 bytes & last 32 bytes
 *
 * @author Logan Evans <lpe@fb.com>
 */

#if defined(__AVX2__)

// This threshold is half of L1 cache on a Skylake machine, which means that
// potentially all of L1 will be populated by this copy once it is executed
// (dst and src are cached for temporal copies).
#define NON_TEMPORAL_STORE_THRESHOLD    $32768

        .file       "memcpy.S"
        .section    .text,"ax"

        .type       __folly_memcpy_short, @function
__folly_memcpy_short:
        .cfi_startproc

.L_GE1_LE7:
        cmp         $1, %rdx
        je          .L_EQ1

        cmp         $4, %rdx
        jae         .L_GE4_LE7

.L_GE2_LE3:
        movw        (%rsi), %r8w
        movw        -2(%rsi,%rdx), %r9w
        movw        %r8w, (%rdi)
        movw        %r9w, -2(%rdi,%rdx)
        ret

        .align      2
.L_EQ1:
        movb        (%rsi), %r8b
        movb        %r8b, (%rdi)
        ret

        // Aligning the target of a jump to an even address has a measurable
        // speedup in microbenchmarks.
        .align      2
.L_GE4_LE7:
        movl        (%rsi), %r8d
        movl        -4(%rsi,%rdx), %r9d
        movl        %r8d, (%rdi)
        movl        %r9d, -4(%rdi,%rdx)
        ret

        .cfi_endproc
        .size       __folly_memcpy_short, .-__folly_memcpy_short
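// The 33 <= n <= 256 path below reads symmetric 32-byte blocks from both ends
// of the buffer into %ymm0-%ymm3 (front) and %ymm4-%ymm7 (back) before any
// store is issued, which is what gives it memmove semantics. An illustrative
// C sketch of that scheme (not assembled into this file; the helper name
// copy_33_to_256 is hypothetical):
//
//   #include <string.h>
//
//   static void copy_33_to_256(char* dst, const char* src, size_t n) {
//     char front[4][32], back[4][32];       // stand-ins for %ymm0-3 / %ymm4-7
//     size_t pairs = (n + 63) / 64;         // 1..4 pairs cover 33 <= n <= 256
//     for (size_t i = 0; i < pairs; i++) {  // all loads happen first...
//       memcpy(front[i], src + 32 * i, 32);
//       memcpy(back[i], src + n - 32 * (i + 1), 32);
//     }
//     for (size_t i = 0; i < pairs; i++) {  // ...then all stores
//       memcpy(dst + 32 * i, front[i], 32);
//       memcpy(dst + n - 32 * (i + 1), back[i], 32);
//     }
//   }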
// memcpy is an alternative entrypoint into the function named __folly_memcpy.
// The compiler is able to call memcpy since the name is global, while
// stacktraces will show __folly_memcpy since that is the name of the function.
// This is intended to aid in debugging by making it obvious which version of
// memcpy is being used.
        .align      64
        .globl      __folly_memcpy
        .type       __folly_memcpy, @function

__folly_memcpy:
        .cfi_startproc

        mov         %rdi, %rax    # return: $rdi

        test        %rdx, %rdx
        je          .L_EQ0

        prefetchw   (%rdi)
        prefetchw   -1(%rdi,%rdx)

        cmp         $8, %rdx
        jb          .L_GE1_LE7

.L_GE8:
        cmp         $32, %rdx
        ja          .L_GE33

.L_GE8_LE32:
        cmp         $16, %rdx
        ja          .L_GE17_LE32

.L_GE8_LE16:
        mov         (%rsi), %r8
        mov         -8(%rsi,%rdx), %r9
        mov         %r8, (%rdi)
        mov         %r9, -8(%rdi,%rdx)
.L_EQ0:
        ret

        .align      2
.L_GE17_LE32:
        movdqu      (%rsi), %xmm0
        movdqu      -16(%rsi,%rdx), %xmm1
        movdqu      %xmm0, (%rdi)
        movdqu      %xmm1, -16(%rdi,%rdx)
        ret

        .align      2
.L_GE193_LE256:
        vmovdqu     %ymm3, 96(%rdi)
        vmovdqu     %ymm4, -128(%rdi,%rdx)

.L_GE129_LE192:
        vmovdqu     %ymm2, 64(%rdi)
        vmovdqu     %ymm5, -96(%rdi,%rdx)

.L_GE65_LE128:
        vmovdqu     %ymm1, 32(%rdi)
        vmovdqu     %ymm6, -64(%rdi,%rdx)

.L_GE33_LE64:
        vmovdqu     %ymm0, (%rdi)
        vmovdqu     %ymm7, -32(%rdi,%rdx)

        vzeroupper
        ret

        .align      2
.L_GE33:
        vmovdqu     (%rsi), %ymm0
        vmovdqu     -32(%rsi,%rdx), %ymm7

        cmp         $64, %rdx
        jbe         .L_GE33_LE64

        prefetchw   64(%rdi)

        vmovdqu     32(%rsi), %ymm1
        vmovdqu     -64(%rsi,%rdx), %ymm6

        cmp         $128, %rdx
        jbe         .L_GE65_LE128

        prefetchw   128(%rdi)

        vmovdqu     64(%rsi), %ymm2
        vmovdqu     -96(%rsi,%rdx), %ymm5

        cmp         $192, %rdx
        jbe         .L_GE129_LE192

        prefetchw   192(%rdi)

        vmovdqu     96(%rsi), %ymm3
        vmovdqu     -128(%rsi,%rdx), %ymm4

        cmp         $256, %rdx
        jbe         .L_GE193_LE256

.L_GE257:
        prefetchw   256(%rdi)

        // Check if there is an overlap. If there is an overlap then the caller
        // has a bug since this is undefined behavior. However, for legacy
        // reasons this behavior is expected by some callers.
        //
        // All copies through 256 bytes will operate as a memmove since for
        // those sizes all reads are performed before any writes.
        //
        // This check uses the idea that there is an overlap if
        // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)),
        // or equivalently, there is no overlap if
        // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi).
        //
        // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
        // bytes remain to be copied.

        // (%rsi + %rdx <= %rdi) => no overlap
        lea         (%rsi,%rdx), %r9
        cmp         %rdi, %r9
        jbe         .L_NO_OVERLAP

        // (%rdi + %rdx <= %rsi) => no overlap
        lea         (%rdi,%rdx), %r8
        cmp         %rsi, %r8
        // If no info is available in the branch predictor's cache, Intel CPUs
        // assume forward jumps are not taken. Use a forward jump as
        // overlapping buffers are unlikely.
        ja          .L_OVERLAP

        .align      2
.L_NO_OVERLAP:
        vmovdqu     %ymm0, (%rdi)
        vmovdqu     %ymm1, 32(%rdi)
        vmovdqu     %ymm2, 64(%rdi)
        vmovdqu     %ymm3, 96(%rdi)

        // Align %rdi to a 32 byte boundary.
        // %rcx = 128 - (%rdi & 31)
        mov         $128, %rcx
        and         $31, %rdi
        sub         %rdi, %rcx

        lea         (%rsi,%rcx), %rsi
        lea         (%rax,%rcx), %rdi
        sub         %rcx, %rdx
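        // Illustrative C sketch (not assembled) of the overlap test and the
        // alignment bump above; dst, src, and n mirror the incoming %rdi,
        // %rsi, and %rdx:
        //
        //   int no_overlap = (uintptr_t)src + n <= (uintptr_t)dst ||
        //                    (uintptr_t)dst + n <= (uintptr_t)src;
        //   // The first 128 bytes were already stored unaligned above, so
        //   // skipping up to 128 bytes to reach 32-byte-aligned dst is safe.
        //   size_t bump = 128 - ((uintptr_t)dst & 31);  // %rcx
        //   src += bump; dst += bump; n -= bump;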
        // %r8 is the end condition for the loop.
        lea         -128(%rsi,%rdx), %r8

        cmp         NON_TEMPORAL_STORE_THRESHOLD, %rdx
        jae         .L_NON_TEMPORAL_LOOP

        .align      2
.L_ALIGNED_DST_LOOP:
        prefetchw   128(%rdi)
        prefetchw   192(%rdi)

        vmovdqu     (%rsi), %ymm0
        vmovdqu     32(%rsi), %ymm1
        vmovdqu     64(%rsi), %ymm2
        vmovdqu     96(%rsi), %ymm3
        add         $128, %rsi

        vmovdqa     %ymm0, (%rdi)
        vmovdqa     %ymm1, 32(%rdi)
        vmovdqa     %ymm2, 64(%rdi)
        vmovdqa     %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_ALIGNED_DST_LOOP

.L_ALIGNED_DST_LOOP_END:
        sub         %rsi, %r9
        mov         %r9, %rdx

        vmovdqu     %ymm4, -128(%rdi,%rdx)
        vmovdqu     %ymm5, -96(%rdi,%rdx)
        vmovdqu     %ymm6, -64(%rdi,%rdx)
        vmovdqu     %ymm7, -32(%rdi,%rdx)

        vzeroupper
        ret

        .align      2
.L_NON_TEMPORAL_LOOP:
        testb       $31, %sil
        jne         .L_ALIGNED_DST_LOOP
        // This is prefetching the source data unlike ALIGNED_DST_LOOP which
        // prefetches the destination data. This choice is again informed by
        // benchmarks. With a non-temporal store the entirety of the cache line
        // is being written so the previous data can be discarded without being
        // fetched.
        prefetchnta 128(%rsi)
        prefetchnta 192(%rsi)

        vmovntdqa   (%rsi), %ymm0
        vmovntdqa   32(%rsi), %ymm1
        vmovntdqa   64(%rsi), %ymm2
        vmovntdqa   96(%rsi), %ymm3
        add         $128, %rsi

        vmovntdq    %ymm0, (%rdi)
        vmovntdq    %ymm1, 32(%rdi)
        vmovntdq    %ymm2, 64(%rdi)
        vmovntdq    %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_NON_TEMPORAL_LOOP

        sfence
        jmp         .L_ALIGNED_DST_LOOP_END

        .align      2
.L_OVERLAP:
        cmp         %rdi, %rsi
        jb          .L_OVERLAP_BWD  // %rsi < %rdi => backward-copy
        je          .L_RET          // %rsi == %rdi => return, nothing to copy

        // Source & destination buffers overlap. Forward copy.

        vmovdqu     (%rsi), %ymm8

        // Align %rdi to a 32 byte boundary.
        // %rcx = 32 - (%rdi & 31)
        mov         $32, %rcx
        and         $31, %rdi
        sub         %rdi, %rcx

        lea         (%rsi,%rcx), %rsi
        lea         (%rax,%rcx), %rdi
        sub         %rcx, %rdx

        // %r8 is the end condition for the loop.
        lea         -128(%rsi,%rdx), %r8

.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
        prefetchw   128(%rdi)
        prefetchw   192(%rdi)

        vmovdqu     (%rsi), %ymm0
        vmovdqu     32(%rsi), %ymm1
        vmovdqu     64(%rsi), %ymm2
        vmovdqu     96(%rsi), %ymm3
        add         $128, %rsi

        vmovdqa     %ymm0, (%rdi)
        vmovdqa     %ymm1, 32(%rdi)
        vmovdqa     %ymm2, 64(%rdi)
        vmovdqa     %ymm3, 96(%rdi)
        add         $128, %rdi

        cmp         %r8, %rsi
        jb          .L_OVERLAP_FWD_ALIGNED_DST_LOOP

        sub         %rsi, %r9
        mov         %r9, %rdx

        vmovdqu     %ymm4, -128(%rdi,%rdx)
        vmovdqu     %ymm5, -96(%rdi,%rdx)
        vmovdqu     %ymm6, -64(%rdi,%rdx)
        vmovdqu     %ymm7, -32(%rdi,%rdx)
        vmovdqu     %ymm8, (%rax)  // %rax == the original (unaligned) %rdi

        vzeroupper

.L_RET:
        ret
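// Illustrative C sketch (not assembled) of the backward overlap path below,
// for dst > src and n > 256; the function name overlap_bwd is hypothetical:
//
//   #include <stdint.h>
//   #include <string.h>
//
//   static void overlap_bwd(char* dst, const char* src, size_t n) {
//     char head[128], tail[32];
//     memcpy(head, src, 128);          // %ymm0-%ymm3, loaded before this point
//     memcpy(tail, src + n - 32, 32);  // %ymm8
//     // Align the end of dst downward to a 32-byte boundary.
//     size_t m = n - ((uintptr_t)(dst + n) & 31);
//     const char* s = src + m;
//     char* d = dst + m;
//     while (s > src + 128) {          // %r8 holds src + 128
//       s -= 128; d -= 128;
//       memmove(d, s, 128);            // 4 unaligned loads + 4 aligned stores
//     }
//     memcpy(dst, head, 128);          // store the saved head last...
//     memcpy(dst + n - 32, tail, 32);  // ...and the saved unaligned tail
//   }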
.L_OVERLAP_BWD:
        # Save last 32 bytes.
        vmovdqu     -32(%rsi,%rdx), %ymm8
        lea         -32(%rdi,%rdx), %r9

        // %r8 is the end condition for the loop.
        lea         128(%rsi), %r8

        // Align %rdi + %rdx (destination end) to a 32 byte boundary.
        // %rcx = (%rdi + %rdx - 32) & 31
        mov         %r9, %rcx
        and         $31, %rcx

        // Set %rsi & %rdi to the end of the 32 byte aligned range.
        sub         %rcx, %rdx
        add         %rdx, %rsi
        add         %rdx, %rdi

.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
        prefetchw   -128(%rdi)
        prefetchw   -192(%rdi)

        vmovdqu     -32(%rsi), %ymm4
        vmovdqu     -64(%rsi), %ymm5
        vmovdqu     -96(%rsi), %ymm6
        vmovdqu     -128(%rsi), %ymm7
        sub         $128, %rsi

        vmovdqa     %ymm4, -32(%rdi)
        vmovdqa     %ymm5, -64(%rdi)
        vmovdqa     %ymm6, -96(%rdi)
        vmovdqa     %ymm7, -128(%rdi)
        sub         $128, %rdi

        cmp         %r8, %rsi
        ja          .L_OVERLAP_BWD_ALIGNED_DST_LOOP

        vmovdqu     %ymm0, (%rax)  // %rax == the original (unaligned) %rdi
        vmovdqu     %ymm1, 32(%rax)
        vmovdqu     %ymm2, 64(%rax)
        vmovdqu     %ymm3, 96(%rax)
        vmovdqu     %ymm8, (%r9)

        vzeroupper
        ret

        .cfi_endproc
        .size       __folly_memcpy, .-__folly_memcpy

#ifdef FOLLY_MEMCPY_IS_MEMCPY
        .weak       memcpy
        memcpy = __folly_memcpy

        .weak       memmove
        memmove = __folly_memcpy
#endif

        .ident      "GCC: (GNU) 4.8.2"
#ifdef __linux__
        .section    .note.GNU-stack,"",@progbits
#endif

#endif
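// Illustrative usage sketch (C, not part of this file): when built with
// FOLLY_MEMCPY_IS_MEMCPY, the weak aliases above let plain libc-style calls
// resolve to __folly_memcpy, including the memmove-style handling of
// overlapping buffers exercised below.
//
//   #include <assert.h>
//   #include <string.h>
//
//   int main(void) {
//     char buf[512];
//     for (int i = 0; i < 512; i++) buf[i] = (char)i;
//     memmove(buf + 1, buf, 400);  // overlapping copy, n > 256 path
//     assert(buf[1] == (char)0 && buf[400] == (char)399);
//     return 0;
//   }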