// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>

#include "copy-unaligned.h"

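/*
 * The probe buffer is split in half: dst points into the first half and src
 * into the second. Both pointers are deliberately misaligned, so the copy
 * size is kept slightly below half the buffer to stay within bounds.
 */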
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

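/* Starts out zero (RISCV_HWPROBE_MISALIGNED_UNKNOWN) until the CPU has been probed. */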
DEFINE_PER_CPU(long, misaligned_access_speed);

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static cpumask_t fast_misaligned_access;
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
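	/* Do a warmup for the byte-granular copy as well. */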
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

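	/* Repeat the timed measurement, this time with the byte-granular copy. */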
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

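	/* Misaligned accesses are "fast" if the word copy beats the byte-at-a-time copy. */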
	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

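/* Called via on_each_cpu(); CPU 0 is skipped here and measured separately below. */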
static void check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

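/* Enabled only while every CPU being considered has fast misaligned accesses. */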
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

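/*
 * Enable the static key only when the fast-access mask covers the expected
 * number of CPUs (weight); otherwise disable it.
 */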
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

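/* arch_initcall_sync runs after the plain arch_initcall below, once all CPUs have been probed. */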
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
	static struct page *buf;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
		goto exit;

	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!buf) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return -ENOMEM;
	}

	check_unaligned_access(buf);
	__free_pages(buf, MISALIGNED_BUFFER_ORDER);

exit:
	set_unaligned_access_static_branches();

	return 0;
}

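/* Hotplug callback: recompute the static branch without the CPU that is going away. */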
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return 0;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
	return 0;
}

static int check_unaligned_access_all_cpus(void)
{
	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();

	if (!all_cpus_emulated)
		return check_unaligned_access_speed_all_cpus();

	return 0;
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static int check_unaligned_access_all_cpus(void)
{
	check_unaligned_access_emulated_all_cpus();

	return 0;
}
#endif

arch_initcall(check_unaligned_access_all_cpus);