xref: /freebsd/sys/kern/kern_membarrier.c (revision 4b9d6057)
1 /*-
2  * Copyright (c) 2021 The FreeBSD Foundation
3  *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
5  * under sponsorship from the FreeBSD Foundation.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/lock.h>
33 #include <sys/membarrier.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/sched.h>
37 #include <sys/smp.h>
38 #include <sys/syscallsubr.h>
39 #include <sys/sysproto.h>
40 
41 #include <vm/vm_param.h>
42 #include <vm/vm.h>
43 #include <vm/pmap.h>
44 #include <vm/vm_map.h>
45 
46 #define MEMBARRIER_SUPPORTED_CMDS	(			\
47     MEMBARRIER_CMD_GLOBAL |					\
48     MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
49     MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
50     MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
51     MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
52     MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
53     MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
54 
55 static void
56 membarrier_action_seqcst(void *arg __unused)
57 {
58 	atomic_thread_fence_seq_cst();
59 }
60 
61 static void
62 membarrier_action_seqcst_sync_core(void *arg __unused)
63 {
64 	atomic_thread_fence_seq_cst();
65 	cpu_sync_core();
66 }
67 
68 static void
69 do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
70 {
71 	atomic_thread_fence_seq_cst();
72 	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
73 	    smp_no_rendezvous_barrier, NULL);
74 	atomic_thread_fence_seq_cst();
75 }
76 
77 static void
78 check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
79 {
80 	struct pcpu *pc;
81 	uint64_t sw;
82 
83 	if (CPU_ISSET(c, csp))
84 		return;
85 
86 	pc = cpuid_to_pcpu[c];
87 	if (pc->pc_curthread == pc->pc_idlethread) {
88 		CPU_SET(c, csp);
89 		return;
90 	}
91 
92 	/*
93 	 * Sync with context switch to ensure that override of
94 	 * pc_curthread with non-idle thread pointer is visible before
95 	 * reading of pc_switchtime.
96 	 */
97 	atomic_thread_fence_acq();
98 
99 	sw = pc->pc_switchtime;
100 	if (init)
101 		swt[c] = sw;
102 	else if (sw != swt[c])
103 		CPU_SET(c, csp);
104 }
105 
/*
 * XXXKIB: We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with the full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return.  E.g. on amd64 we
 * typically return without IRET.
 */
115 int
116 kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
117 {
118 	struct proc *p, *p1;
119 	struct thread *td1;
120 	cpuset_t cs;
121 	uint64_t *swt;
122 	int c, error;
123 	bool first;
124 
125 	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
126 		return (EINVAL);
127 
128 	if (cmd == MEMBARRIER_CMD_QUERY) {
129 		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
130 		return (0);
131 	}
132 
133 	p = td->td_proc;
134 	error = 0;
135 
136 	switch (cmd) {
137 	case MEMBARRIER_CMD_GLOBAL:
138 		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
139 		CPU_ZERO(&cs);
140 		sched_pin();
141 		CPU_SET(PCPU_GET(cpuid), &cs);
142 		for (first = true; error == 0; first = false) {
143 			CPU_FOREACH(c)
144 				check_cpu_switched(c, &cs, swt, first);
145 			if (CPU_CMP(&cs, &all_cpus) == 0)
146 				break;
147 			error = pause_sig("mmbr", 1);
148 			if (error == EWOULDBLOCK)
149 				error = 0;
150 		}
151 		sched_unpin();
152 		free(swt, M_TEMP);
153 		atomic_thread_fence_seq_cst();
154 		break;
155 
156 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
157 		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
158 			error = EPERM;
159 		} else {
160 			CPU_ZERO(&cs);
161 			CPU_FOREACH(c) {
162 				td1 = cpuid_to_pcpu[c]->pc_curthread;
163 				p1 = td1->td_proc;
164 				if (p1 != NULL &&
165 				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
166 					CPU_SET(c, &cs);
167 			}
168 			do_membarrier_ipi(&cs, membarrier_action_seqcst);
169 		}
170 		break;
171 
172 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
173 		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
174 			PROC_LOCK(p);
175 			p->p_flag2 |= P2_MEMBAR_GLOBE;
176 			PROC_UNLOCK(p);
177 		}
178 		break;
179 
180 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
181 		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
182 			error = EPERM;
183 		} else {
184 			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
185 			do_membarrier_ipi(&cs, membarrier_action_seqcst);
186 		}
187 		break;
188 
189 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
190 		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
191 			PROC_LOCK(p);
192 			p->p_flag2 |= P2_MEMBAR_PRIVE;
193 			PROC_UNLOCK(p);
194 		}
195 		break;
196 
197 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
198 		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
199 			error = EPERM;
200 		} else {
201 			/*
202 			 * Calculating the IPI multicast mask from
203 			 * pmap active mask means that we do not call
204 			 * cpu_sync_core() on CPUs that were missed
205 			 * from pmap active mask but could be switched
206 			 * from or to meantime.  This is fine at least
207 			 * on amd64 because threads always use slow
208 			 * (IRETQ) path to return from syscall after
209 			 * context switch.
210 			 */
211 			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
212 
213 			do_membarrier_ipi(&cs,
214 			    membarrier_action_seqcst_sync_core);
215 		}
216 		break;
217 
218 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
219 		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
220 			PROC_LOCK(p);
221 			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
222 			PROC_UNLOCK(p);
223 		}
224 		break;
225 
226 	default:
227 		error = EINVAL;
228 		break;
229 	}
230 
231 	return (error);
232 }
233 
234 int
235 sys_membarrier(struct thread *td, struct membarrier_args *uap)
236 {
237 	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
238 }
239