1 //===-- Memset utils --------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
10 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
11 
12 #include "src/string/memory_utils/elements.h"
13 #include "src/string/memory_utils/utils.h"
14 
15 #include <stddef.h> // size_t
16 
17 namespace __llvm_libc {
18 
19 // A general purpose implementation assuming cheap unaligned writes for sizes:
20 // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
21 // or 64 Bytes at a time, the compiler will expand them as needed.
22 //
23 // This implementation is subject to change as we benchmark more processors. We
24 // may also want to customize it for processors with specialized instructions
25 // that performs better (e.g. `rep stosb`).
26 //
27 // A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
28 // We want to balance two things here:
29 //  - The number of redundant writes (when using `SetBlockOverlap`),
30 //  - The number of conditionals for sizes <=128 (~90% of memset calls are for
31 //    such sizes).
32 //
33 // For the range 64-128:
34 //  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
35 //  is wasteful near 65 but efficient toward 128.
36 //  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
37 //  96 or 128 Bytes.
38 //  - Another approach could be to use a hybrid approach Copy<64>+Overlap<32>
39 //  for 65-96 and Copy<96>+Overlap<32> for 97-128
40 //
41 // Benchmarks showed that redundant writes were cheap (for Intel X86) but
42 // conditionals were expensive, even on processors that do not support
43 // at a time (pre-AVX512F). We also want to favor short functions that allow
44 // more hot code to fit in the iL1 cache.
45 //
46 // Above 128 we have to use conditionals since we don't know the upper bound in
47 // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
48 // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
49 // superior for sizes that mattered.
50 inline static void GeneralPurposeMemset(char *dst, unsigned char value,
51                                         size_t count) {
52 #if defined(__i386__) || defined(__x86_64__)
53   using namespace ::__llvm_libc::x86;
54 #else
55   using namespace ::__llvm_libc::scalar;
56 #endif
57 
58   if (count == 0)
59     return;
60   if (count == 1)
61     return SplatSet<_1>(dst, value);
62   if (count == 2)
63     return SplatSet<_2>(dst, value);
64   if (count == 3)
65     return SplatSet<_3>(dst, value);
66   if (count == 4)
67     return SplatSet<_4>(dst, value);
68   if (count <= 8)
69     return SplatSet<HeadTail<_4>>(dst, value, count);
70   if (count <= 16)
71     return SplatSet<HeadTail<_8>>(dst, value, count);
72   if (count <= 32)
73     return SplatSet<HeadTail<_16>>(dst, value, count);
74   if (count <= 64)
75     return SplatSet<HeadTail<_32>>(dst, value, count);
76   if (count <= 128)
77     return SplatSet<HeadTail<_64>>(dst, value, count);
78   return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
79 }
80 
81 } // namespace __llvm_libc
82 
83 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
84