/**
 * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
 * Copyright (C) ARM Ltd. 2016-2020. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */

#ifndef UCS_AARCH64_CPU_H_
#define UCS_AARCH64_CPU_H_

#include "config.h"
#include <time.h>
#include <string.h>
#include <sys/times.h>
#include <ucs/sys/compiler_def.h>
#include <ucs/arch/generic/cpu.h>
#include <ucs/sys/math.h>
#include <ucs/type/status.h>
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif


#define UCS_ARCH_CACHE_LINE_SIZE 64

BEGIN_C_DECLS

/** @file cpu.h */

/**
 * Assume the worst - weak memory ordering.
 */

#define ucs_aarch64_dmb(_op) asm volatile ("dmb " #_op ::: "memory")
#define ucs_aarch64_isb(_op) asm volatile ("isb " #_op ::: "memory")
#define ucs_aarch64_dsb(_op) asm volatile ("dsb " #_op ::: "memory")

/* These macros serialize stores across Normal NC (or Device) and WB memory
 * (see Arm Spec, B2.7.2). They are based on recent changes in the Linux kernel:
 * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22ec71615d824f4f11d38d0e55a88d8956b7e45f
 *
 * The underlying barrier code was changed to use the lighter-weight DMB
 * instead of DSB. The barrier is used to synchronize accesses between
 * write-back and device-mapped memory (PCIe BAR).
 */
#define ucs_memory_bus_fence() ucs_aarch64_dmb(oshsy)
#define ucs_memory_bus_store_fence() ucs_aarch64_dmb(oshst)
#define ucs_memory_bus_load_fence() ucs_aarch64_dmb(oshld)
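
/* Illustrative usage sketch (not taken from the UCX sources; wqe, WQE_SEND,
 * doorbell and db_val are hypothetical names): when a descriptor is written to
 * write-back host memory and a doorbell is then written to device (BAR)
 * memory, a bus store fence is needed in between so the device observes the
 * descriptor before the doorbell:
 *
 *     wqe->opcode = WQE_SEND;           // store to write-back host memory
 *     ucs_memory_bus_store_fence();     // order WB store before the BAR store
 *     *doorbell   = db_val;             // store to device-mapped (BAR) memory
 */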

/* The macro is used to flush all pending stores from the write-combining
 * buffer. Some microarchitectures flush the stores automatically once a cache
 * line is full, so no additional barrier is needed there.
 */
#if defined(HAVE_AARCH64_THUNDERX2)
#define ucs_memory_bus_cacheline_wc_flush()
#else
/* The macro is used to flush stores to Normal NC or Device memory */
#define ucs_memory_bus_cacheline_wc_flush() ucs_aarch64_dmb(oshst)
#endif
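
/* Usage sketch (bar_wc_ptr and wqe are hypothetical): after writing an inline
 * descriptor into a write-combining BAR mapping, flush the WC buffer so the
 * device sees the whole cache line:
 *
 *     memcpy(bar_wc_ptr, wqe, UCS_ARCH_CACHE_LINE_SIZE);
 *     ucs_memory_bus_cacheline_wc_flush();
 */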

#define ucs_memory_cpu_fence() ucs_aarch64_dmb(ish)
#define ucs_memory_cpu_store_fence() ucs_aarch64_dmb(ishst)
#define ucs_memory_cpu_load_fence() ucs_aarch64_dmb(ishld)

/* The macro is used to serialize stores to Normal NC or Device memory
 * (see Arm Spec, B2.7.2)
 */
#define ucs_memory_cpu_wc_fence() ucs_aarch64_dmb(oshst)
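
/* Usage sketch (hypothetical shared variables 'data' and 'flag') of the CPU
 * fences for message passing between two cores:
 *
 *     producer:                            consumer:
 *         data = value;                        while (!flag) { }
 *         ucs_memory_cpu_store_fence();        ucs_memory_cpu_load_fence();
 *         flag = 1;                            consume(data);
 */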


/*
 * ARM processor ID (ARM ISA - Main ID Register, EL1)
 */
typedef struct ucs_aarch64_cpuid {
    int implementer;
    int architecture;
    int variant;
    int part;
    int revision;
} ucs_aarch64_cpuid_t;


/**
 * Get ARM CPU identifier and version
 */
void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid);

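/* Minimal usage sketch: decode the MIDR_EL1 fields filled in by
 * ucs_aarch64_cpuid(), for example to detect a Fujitsu ARMv8 core the same way
 * ucs_arch_get_cpu_vendor() below does:
 *
 *     ucs_aarch64_cpuid_t cpuid;
 *     ucs_aarch64_cpuid(&cpuid);
 *     if ((cpuid.implementer == 0x46) && (cpuid.architecture == 8)) {
 *         // Fujitsu-specific path
 *     }
 */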

#if defined(HAVE_AARCH64_THUNDERX2)
extern void *__memcpy_thunderx2(void *, const void *, size_t);
#endif


#if HAVE_HW_TIMER
static inline uint64_t ucs_arch_read_hres_clock(void)
{
    uint64_t ticks;
    asm volatile("isb" : : : "memory");
    asm volatile("mrs %0, cntvct_el0" : "=r" (ticks));
    return ticks;
}

static inline double ucs_arch_get_clocks_per_sec()
{
    uint64_t freq;
    asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
    return (double) freq;
}

#else

#define ucs_arch_read_hres_clock ucs_arch_generic_read_hres_clock
#define ucs_arch_get_clocks_per_sec ucs_arch_generic_get_clocks_per_sec

#endif
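
/* Usage sketch: measure an interval with the generic-timer based clock. The
 * code under measurement is a placeholder:
 *
 *     uint64_t t0 = ucs_arch_read_hres_clock();
 *     // ... code under measurement ...
 *     uint64_t t1 = ucs_arch_read_hres_clock();
 *     double elapsed_sec = (double)(t1 - t0) / ucs_arch_get_clocks_per_sec();
 */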

static inline ucs_cpu_model_t ucs_arch_get_cpu_model()
{
    return UCS_CPU_MODEL_ARM_AARCH64;
}

static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor()
{
    ucs_aarch64_cpuid_t cpuid;
    ucs_aarch64_cpuid(&cpuid);

    if ((cpuid.implementer == 0x46) && (cpuid.architecture == 8)) {
        return UCS_CPU_VENDOR_FUJITSU_ARM;
    }

    return UCS_CPU_VENDOR_GENERIC_ARM;
}

static inline int ucs_arch_get_cpu_flag()
{
    return UCS_CPU_FLAG_UNKNOWN;
}

static inline void ucs_cpu_init()
{
}

static inline void ucs_arch_wait_mem(void *address)
{
    unsigned long tmp;
    asm volatile ("ldxrb %w0, %1 \n"
                  "wfe           \n"
                  : "=&r"(tmp)
                  : "Q"(address));
}
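
/* Usage sketch (hypothetical flag pointer): instead of busy-polling a flag
 * byte at full speed, arm the exclusive monitor on it and suspend with WFE,
 * re-checking the value after every wake-up:
 *
 *     volatile uint8_t *flag = ...;
 *     while (*flag == 0) {
 *         ucs_arch_wait_mem((void*)flag);
 *     }
 */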

#if !HAVE___CLEAR_CACHE
static inline void ucs_arch_clear_cache(void *start, void *end)
{
#if HAVE___AARCH64_SYNC_CACHE_RANGE
    /* do not allow global declaration of compiler intrinsic */
    void __aarch64_sync_cache_range(void* beg, void* end);

    __aarch64_sync_cache_range(start, end);
#else
    uintptr_t ptr;
    unsigned icache;
    unsigned dcache;
    unsigned dic;
    unsigned idc;
    unsigned ctr_el0;

    /* Get the cache line size, using the ctr_el0 register
     *
     * Bits     Name      Function
     * *****************************
     * [31]     -         Reserved, RES1.
     * [30]     -         Reserved, RES0.
     * [29]     DIC       Instruction cache invalidation requirements for data to instruction
     *                    coherence.
     * [28]     IDC       Data cache clean requirements for instruction to data coherence.
     * [27:24]  CWG       Cache Write-Back granule. Log2 of the number of words of the
     *                    maximum size of memory that can be overwritten as a result of
     *                    the eviction of a cache entry that has had a memory location
     *                    in it modified:
     *                    0x4
     *                    Cache Write-Back granule size is 16 words.
     * [23:20]  ERG       Exclusives Reservation Granule. Log2 of the number of words of
     *                    the maximum size of the reservation granule that has been
     *                    implemented for the Load-Exclusive and Store-Exclusive instructions:
     *                    0x4
     *                    Exclusive reservation granule size is 16 words.
     * [19:16]  DminLine  Log2 of the number of words in the smallest cache line of all the
     *                    data and unified caches that the processor controls:
     *                    0x4
     *                    Smallest data cache line size is 16 words.
     * [15:14]  L1Ip      L1 Instruction cache policy. Indicates the indexing and tagging
     *                    policy for the L1 Instruction cache:
     *                    0b10
     *                    Virtually Indexed Physically Tagged (VIPT).
     * [13:4]   -         Reserved, RES0.
     * [3:0]    IminLine  Log2 of the number of words in the smallest cache line of all
     *                    the instruction caches that the processor controls.
     *                    0x4
     *                    Smallest instruction cache line size is 16 words.
     */
    asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0));
    icache = sizeof(int) << (ctr_el0 & 0xf);
    dcache = sizeof(int) << ((ctr_el0 >> 16) & 0xf);
    dic    = (ctr_el0 >> 29) & 0x1;
    idc    = (ctr_el0 >> 28) & 0x1;

    /*
     * Check if Data cache clean to the Point of Unification is required for
     * instruction to data coherence
     */
    if (idc == 0) {
        for (ptr = ucs_align_down((uintptr_t)start, dcache); ptr < (uintptr_t)end; ptr += dcache) {
            asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory");
        }
    }

    /*
     * Check if Instruction cache invalidation to the Point of Unification is
     * required for data to instruction coherence.
     */
    if (dic == 0) {
        ucs_aarch64_dsb(ish);
        for (ptr = ucs_align_down((uintptr_t)start, icache); ptr < (uintptr_t)end; ptr += icache) {
            asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory");
        }
    }
    ucs_aarch64_dsb(ish);
    ucs_aarch64_isb();
#endif
}
#endif
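
/* Usage sketch (code_buf and code_len are hypothetical): after writing or
 * patching instructions in a buffer, synchronize the data and instruction
 * caches over that range before jumping to it:
 *
 *     ucs_arch_clear_cache(code_buf, (char*)code_buf + code_len);
 */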

static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
#if defined(HAVE_AARCH64_THUNDERX2)
    return __memcpy_thunderx2(dst, src, len);
#else
    return memcpy(dst, src, len);
#endif
}

static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
{
#if defined(HAVE_AARCH64_THUNDERX2)
    __memcpy_thunderx2(dst, src, len);
#else
    memcpy(dst, src, len);
#endif
}

static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
    return UCS_ERR_UNSUPPORTED;
}

END_C_DECLS

#endif