1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2015.  ALL RIGHTS RESERVED.
3 * Copyright (C) ARM Ltd. 2016-2020.  ALL RIGHTS RESERVED.
4 *
5 * See file LICENSE for terms.
6 */
7 
8 #ifndef UCS_AARCH64_CPU_H_
9 #define UCS_AARCH64_CPU_H_
10 
11 #include "config.h"
12 #include <time.h>
13 #include <string.h>
14 #include <sys/times.h>
15 #include <ucs/sys/compiler_def.h>
16 #include <ucs/arch/generic/cpu.h>
17 #include <ucs/sys/math.h>
18 #include <ucs/type/status.h>
19 #ifdef __ARM_NEON
20 #include <arm_neon.h>
21 #endif
22 
23 
24 #define UCS_ARCH_CACHE_LINE_SIZE 64
25 
26 BEGIN_C_DECLS
27 
28 /** @file cpu.h */
29 
30 /**
31  * Assume the worst - weak memory ordering.
32  */
33 
34 #define ucs_aarch64_dmb(_op)          asm volatile ("dmb " #_op ::: "memory")
35 #define ucs_aarch64_isb(_op)          asm volatile ("isb " #_op ::: "memory")
36 #define ucs_aarch64_dsb(_op)          asm volatile ("dsb " #_op ::: "memory")
37 
38 /* The macro is used to serialize stores across Normal NC (or Device) and WB
39  * memory, (see Arm Spec, B2.7.2).  Based on recent changes in Linux kernel:
40  * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=22ec71615d824f4f11d38d0e55a88d8956b7e45f
41  *
42  * The underlying barrier code was changed to use lighter weight DMB instead
43  * of DSB. The barrier used for synchronization of access between write back
44  * and device mapped memory (PCIe BAR).
45  */
46 #define ucs_memory_bus_fence()        ucs_aarch64_dmb(oshsy)
47 #define ucs_memory_bus_store_fence()  ucs_aarch64_dmb(oshst)
48 #define ucs_memory_bus_load_fence()   ucs_aarch64_dmb(oshld)
49 
50 /* The macro is used to flush all pending stores from write combining buffer.
51  * Some uarch "auto" flush the stores once cache line is full (no need for additional barrier).
52  */
53 #if defined(HAVE_AARCH64_THUNDERX2)
54 #define ucs_memory_bus_cacheline_wc_flush()
55 #else
56 /* The macro is used to flush stores to Normal NC or Device memory */
57 #define ucs_memory_bus_cacheline_wc_flush()     ucs_aarch64_dmb(oshst)
58 #endif
59 
60 #define ucs_memory_cpu_fence()        ucs_aarch64_dmb(ish)
61 #define ucs_memory_cpu_store_fence()  ucs_aarch64_dmb(ishst)
62 #define ucs_memory_cpu_load_fence()   ucs_aarch64_dmb(ishld)
63 
64 /* The macro is used to serialize stores to Normal NC or Device memory
65  * (see Arm Spec, B2.7.2)
66  */
67 #define ucs_memory_cpu_wc_fence()     ucs_aarch64_dmb(oshst)
68 
69 
70 /*
71  * ARM processor ID (ARM ISA - Main ID Register, EL1)
72  */
73 typedef struct ucs_aarch64_cpuid {
74     int       implementer;
75     int       architecture;
76     int       variant;
77     int       part;
78     int       revision;
79 } ucs_aarch64_cpuid_t;
80 
81 
82 /**
83  * Get ARM CPU identifier and version
84  */
85 void ucs_aarch64_cpuid(ucs_aarch64_cpuid_t *cpuid);
86 
87 
88 #if defined(HAVE_AARCH64_THUNDERX2)
89 extern void *__memcpy_thunderx2(void *, const void *, size_t);
90 #endif
91 
92 
93 #if HAVE_HW_TIMER
/**
 * Read the EL0 virtual counter-timer (cntvct_el0) as a raw tick count.
 *
 * The ISB before the MRS prevents the counter read from being reordered or
 * speculated ahead of preceding instructions, so the sampled value reflects
 * program order.
 *
 * @return Current virtual counter value, in ticks. The tick frequency is
 *         reported by ucs_arch_get_clocks_per_sec().
 */
static inline uint64_t ucs_arch_read_hres_clock(void)
{
    uint64_t ticks;
    /* Serialize the instruction stream before sampling the counter */
    asm volatile("isb" : : : "memory");
    asm volatile("mrs %0, cntvct_el0" : "=r" (ticks));
    return ticks;
}
101 
ucs_arch_get_clocks_per_sec()102 static inline double ucs_arch_get_clocks_per_sec()
103 {
104     uint64_t freq;
105     asm volatile("mrs %0, cntfrq_el0" : "=r" (freq));
106     return (double) freq;
107 }
108 
109 #else
110 
111 #define ucs_arch_read_hres_clock ucs_arch_generic_read_hres_clock
112 #define ucs_arch_get_clocks_per_sec ucs_arch_generic_get_clocks_per_sec
113 
114 #endif
115 
ucs_arch_get_cpu_model()116 static inline ucs_cpu_model_t ucs_arch_get_cpu_model()
117 {
118     return UCS_CPU_MODEL_ARM_AARCH64;
119 }
120 
ucs_arch_get_cpu_vendor()121 static inline ucs_cpu_vendor_t ucs_arch_get_cpu_vendor()
122 {
123     ucs_aarch64_cpuid_t cpuid;
124     ucs_aarch64_cpuid(&cpuid);
125 
126     if ((cpuid.implementer == 0x46) && (cpuid.architecture == 8)) {
127         return UCS_CPU_VENDOR_FUJITSU_ARM;
128     }
129 
130     return UCS_CPU_VENDOR_GENERIC_ARM;
131 }
132 
ucs_arch_get_cpu_flag()133 static inline int ucs_arch_get_cpu_flag()
134 {
135     return UCS_CPU_FLAG_UNKNOWN;
136 }
137 
/**
 * Perform per-process CPU-specific initialization.
 *
 * No initialization is required on aarch64; this is an intentional no-op
 * kept so that arch-independent code can call it unconditionally.
 */
static inline void ucs_cpu_init(void)
{
}
141 
ucs_arch_wait_mem(void * address)142 static inline void ucs_arch_wait_mem(void *address)
143 {
144     unsigned long tmp;
145     asm volatile ("ldxrb %w0, %1 \n"
146                   "wfe           \n"
147                   : "=&r"(tmp)
148                   : "Q"(address));
149 }
150 
151 #if !HAVE___CLEAR_CACHE
/**
 * Make the instruction cache coherent with data written to [start, end),
 * e.g. after generating or patching code in that range.
 *
 * Fallback implementation used only when the compiler does not provide
 * __clear_cache(); prefers the libgcc helper when available.
 *
 * @param start  First byte of the modified range.
 * @param end    One past the last byte of the modified range.
 */
static inline void ucs_arch_clear_cache(void *start, void *end)
{
#if HAVE___AARCH64_SYNC_CACHE_RANGE
    /* do not allow global declaration of compiler intrinsic */
    void __aarch64_sync_cache_range(void* beg, void* end);

    __aarch64_sync_cache_range(start, end);
#else
    uintptr_t ptr;
    unsigned icache;
    unsigned dcache;
    unsigned dic;
    unsigned idc;
    unsigned ctr_el0;

    /* Get cache line size, using ctr_el0 register
     *
     * Bits    Name      Function
     * *****************************
     * [31]    -         Reserved, RES1.
     * [30]    -         Reserved, RES0.
     * [29]    DIC       Instruction cache invalidation requirements for data to instruction
     *                   coherence.
     * [28]    IDC       Data cache clean requirements for instruction to data coherence.
     * [27:24] CWG       Cache Write-Back granule. Log2 of the number of words of the
     *                   maximum size of memory that can be overwritten as a result of
     *                   the eviction of a cache entry that has had a memory location
     *                   in it modified:
     *                   0x4
     *                   Cache Write-Back granule size is 16 words.
     * [23:20] ERG       Exclusives Reservation Granule. Log2 of the number of words of
     *                   the maximum size of the reservation granule that has been
     *                   implemented for the Load-Exclusive and Store-Exclusive instructions:
     *                   0x4
     *                   Exclusive reservation granule size is 16 words.
     * [19:16] DminLine  Log2 of the number of words in the smallest cache line of all the
     *                   data and unified caches that the processor controls:
     *                   0x4
     *                   Smallest data cache line size is 16 words.
     * [15:14] L1lp      L1 Instruction cache policy. Indicates the indexing and tagging
     *                   policy for the L1 Instruction cache:
     *                   0b10
     *                   Virtually Indexed Physically Tagged (VIPT).
     * [13:4]  -         Reserved, res0.
     * [3:0]   IminLine  Log2 of the number of words in the smallest cache line of all
     *                   the instruction caches that the processor controls.
     *                   0x4
     *                   Smallest instruction cache line size is 16 words.
     */
    asm volatile ("mrs\t%0, ctr_el0":"=r" (ctr_el0));
    /* Line sizes are encoded as log2 of the number of 4-byte words */
    icache = sizeof(int) << (ctr_el0 & 0xf);
    dcache = sizeof(int) << ((ctr_el0 >> 16) & 0xf);
    dic = (ctr_el0 >> 29) & 0x1;
    idc = (ctr_el0 >> 28) & 0x1;

    /*
     * Check if Data cache clean to the Point of Unification is required for instruction to
     * data coherence
     */
    if (idc == 0) {
        /* Clean each D-cache line in the range to the Point of Unification */
        for (ptr = ucs_align_down((uintptr_t)start, dcache); ptr < (uintptr_t)end; ptr += dcache) {
            asm volatile ("dc cvau, %0" :: "r" (ptr) : "memory");
        }
    }

    /*
     * Check if Instruction cache invalidation to the Point of Unification is required for
     * data to instruction coherence.
     */
    if (dic == 0) {
        /* Ensure the cleans above complete before invalidating the I-cache */
        ucs_aarch64_dsb(ish);
        for (ptr = ucs_align_down((uintptr_t)start, icache); ptr < (uintptr_t)end; ptr += icache) {
            asm volatile ("ic ivau, %0" :: "r" (ptr) : "memory");
        }
    }
    /* Complete all maintenance, then flush the pipeline so subsequent
     * instruction fetches see the updated code */
    ucs_aarch64_dsb(ish);
    ucs_aarch64_isb();
#endif
}
231 #endif
232 
/**
 * Copy @len bytes from @src to @dst with no additional ordering guarantees.
 *
 * ThunderX2 builds dispatch to a tuned assembly implementation; all other
 * builds use the C library memcpy().
 *
 * @param dst  Destination buffer.
 * @param src  Source buffer.
 * @param len  Number of bytes to copy.
 * @return @dst
 */
static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len)
{
#if defined(HAVE_AARCH64_THUNDERX2)
    return __memcpy_thunderx2(dst, src, len);
#else
    return memcpy(dst, src, len);
#endif
}
241 
242 static UCS_F_ALWAYS_INLINE void
ucs_memcpy_nontemporal(void * dst,const void * src,size_t len)243 ucs_memcpy_nontemporal(void *dst, const void *src, size_t len)
244 {
245 #if defined(HAVE_AARCH64_THUNDERX2)
246     __memcpy_thunderx2(dst, src,len);
247 #else
248     memcpy(dst, src, len);
249 #endif
250 
251 }
252 
/**
 * Query per-level data cache sizes.
 *
 * Not implemented on aarch64; @cache_sizes is left untouched.
 *
 * @param  cache_sizes  Output array for cache sizes (unused here).
 * @return UCS_ERR_UNSUPPORTED always.
 */
static inline ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes)
{
    return UCS_ERR_UNSUPPORTED;
}
257 
258 END_C_DECLS
259 
260 #endif
261