1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
 * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 
43 #include <asm/processor.h>
44 #include <asm/fpu/internal.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
52 #include <asm/xen/hypervisor.h>
53 #include <asm/vdso.h>
54 #include <asm/resctrl.h>
55 #include <asm/unistd.h>
56 #include <asm/fsgsbase.h>
57 #ifdef CONFIG_IA32_EMULATION
58 /* Not included via unistd.h */
59 #include <asm/unistd_32_ia32.h>
60 #endif
61 
62 #include "process.h"
63 
64 /* Prints also some state that isn't saved in the pt_regs */
__show_regs(struct pt_regs * regs,enum show_regs_mode mode,const char * log_lvl)65 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
66 		 const char *log_lvl)
67 {
68 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
69 	unsigned long d0, d1, d2, d3, d6, d7;
70 	unsigned int fsindex, gsindex;
71 	unsigned int ds, es;
72 
73 	show_iret_regs(regs, log_lvl);
74 
75 	if (regs->orig_ax != -1)
76 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
77 	else
78 		pr_cont("\n");
79 
80 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
81 	       log_lvl, regs->ax, regs->bx, regs->cx);
82 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
83 	       log_lvl, regs->dx, regs->si, regs->di);
84 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
85 	       log_lvl, regs->bp, regs->r8, regs->r9);
86 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
87 	       log_lvl, regs->r10, regs->r11, regs->r12);
88 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
89 	       log_lvl, regs->r13, regs->r14, regs->r15);
90 
91 	if (mode == SHOW_REGS_SHORT)
92 		return;
93 
94 	if (mode == SHOW_REGS_USER) {
95 		rdmsrl(MSR_FS_BASE, fs);
96 		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
97 		printk("%sFS:  %016lx GS:  %016lx\n",
98 		       log_lvl, fs, shadowgs);
99 		return;
100 	}
101 
102 	asm("movl %%ds,%0" : "=r" (ds));
103 	asm("movl %%es,%0" : "=r" (es));
104 	asm("movl %%fs,%0" : "=r" (fsindex));
105 	asm("movl %%gs,%0" : "=r" (gsindex));
106 
107 	rdmsrl(MSR_FS_BASE, fs);
108 	rdmsrl(MSR_GS_BASE, gs);
109 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
110 
111 	cr0 = read_cr0();
112 	cr2 = read_cr2();
113 	cr3 = __read_cr3();
114 	cr4 = __read_cr4();
115 
116 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
117 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
118 	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
119 		log_lvl, regs->cs, ds, es, cr0);
120 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
121 		log_lvl, cr2, cr3, cr4);
122 
123 	get_debugreg(d0, 0);
124 	get_debugreg(d1, 1);
125 	get_debugreg(d2, 2);
126 	get_debugreg(d3, 3);
127 	get_debugreg(d6, 6);
128 	get_debugreg(d7, 7);
129 
130 	/* Only print out debug registers if they are in their non-default state. */
131 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
132 	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
133 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
134 		       log_lvl, d0, d1, d2);
135 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
136 		       log_lvl, d3, d6, d7);
137 	}
138 
139 	if (boot_cpu_has(X86_FEATURE_OSPKE))
140 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
141 }
142 
release_thread(struct task_struct * dead_task)143 void release_thread(struct task_struct *dead_task)
144 {
145 	WARN_ON(dead_task->mm);
146 }
147 
/* Tells the segment helpers in this file whether to operate on FS or GS. */
enum which_selector {
	FS = 0,
	GS = 1,
};
152 
/*
 * Read the inactive GS base and return it.
 *
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per CPU variable would happen
 * with the wrong GS.
 *
 * The swapgs/rdgsbase sequence is not used on Xen paravirt; there the
 * value is read from MSR_KERNEL_GS_BASE instead. When paravirt support is
 * needed, it needs to be renamed with native_ prefix.
 *
 * Callers must have interrupts disabled (checked via lockdep).
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		/* Swap the inactive base in, read it, and swap it back out. */
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		/* rdmsrl() is bracketed so it may be called from noinstr code. */
		instrumentation_begin();
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}
179 
/*
 * Write @gsbase to the inactive GS base.
 *
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, then any access to a per CPU variable would happen
 * with the wrong GS.
 *
 * The swapgs/wrgsbase sequence is not used on Xen paravirt; there the
 * value is written to MSR_KERNEL_GS_BASE instead. When paravirt support is
 * needed, it needs to be renamed with native_ prefix.
 *
 * Callers must have interrupts disabled (checked via lockdep).
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		/* Swap the inactive base in, overwrite it, and swap it back out. */
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		/* wrmsrl() is bracketed so it may be called from noinstr code. */
		instrumentation_begin();
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}
202 
203 /*
204  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
205  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
206  * It's forcibly inlined because it'll generate better code and this function
207  * is hot.
208  */
save_base_legacy(struct task_struct * prev_p,unsigned short selector,enum which_selector which)209 static __always_inline void save_base_legacy(struct task_struct *prev_p,
210 					     unsigned short selector,
211 					     enum which_selector which)
212 {
213 	if (likely(selector == 0)) {
214 		/*
215 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
216 		 * be the pre-existing saved base or it could be zero.  On AMD
217 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
218 		 * anything.
219 		 *
220 		 * This branch is very hot (it's hit twice on almost every
221 		 * context switch between 64-bit programs), and avoiding
222 		 * the RDMSR helps a lot, so we just assume that whatever
223 		 * value is already saved is correct.  This matches historical
224 		 * Linux behavior, so it won't break existing applications.
225 		 *
226 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
227 		 * report that the base is zero, it needs to actually be zero:
228 		 * see the corresponding logic in load_seg_legacy.
229 		 */
230 	} else {
231 		/*
232 		 * If the selector is 1, 2, or 3, then the base is zero on
233 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
234 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
235 		 * has never attempted to preserve the base across context
236 		 * switches.
237 		 *
238 		 * If selector > 3, then it refers to a real segment, and
239 		 * saving the base isn't necessary.
240 		 */
241 		if (which == FS)
242 			prev_p->thread.fsbase = 0;
243 		else
244 			prev_p->thread.gsbase = 0;
245 	}
246 }
247 
save_fsgs(struct task_struct * task)248 static __always_inline void save_fsgs(struct task_struct *task)
249 {
250 	savesegment(fs, task->thread.fsindex);
251 	savesegment(gs, task->thread.gsindex);
252 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
253 		/*
254 		 * If FSGSBASE is enabled, we can't make any useful guesses
255 		 * about the base, and user code expects us to save the current
256 		 * value.  Fortunately, reading the base directly is efficient.
257 		 */
258 		task->thread.fsbase = rdfsbase();
259 		task->thread.gsbase = __rdgsbase_inactive();
260 	} else {
261 		save_base_legacy(task, task->thread.fsindex, FS);
262 		save_base_legacy(task, task->thread.gsindex, GS);
263 	}
264 }
265 
/*
 * While a process is running, current->thread.fsbase and
 * current->thread.gsbase may not match the corresponding CPU registers
 * (see save_base_legacy()).
 *
 * This refreshes the saved FS/GS state of the current task by running
 * save_fsgs() on it with interrupts disabled.
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
/* Exported only when KVM is enabled (built in or as a module). */
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif
282 
loadseg(enum which_selector which,unsigned short sel)283 static __always_inline void loadseg(enum which_selector which,
284 				    unsigned short sel)
285 {
286 	if (which == FS)
287 		loadsegment(fs, sel);
288 	else
289 		load_gs_index(sel);
290 }
291 
load_seg_legacy(unsigned short prev_index,unsigned long prev_base,unsigned short next_index,unsigned long next_base,enum which_selector which)292 static __always_inline void load_seg_legacy(unsigned short prev_index,
293 					    unsigned long prev_base,
294 					    unsigned short next_index,
295 					    unsigned long next_base,
296 					    enum which_selector which)
297 {
298 	if (likely(next_index <= 3)) {
299 		/*
300 		 * The next task is using 64-bit TLS, is not using this
301 		 * segment at all, or is having fun with arcane CPU features.
302 		 */
303 		if (next_base == 0) {
304 			/*
305 			 * Nasty case: on AMD CPUs, we need to forcibly zero
306 			 * the base.
307 			 */
308 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
309 				loadseg(which, __USER_DS);
310 				loadseg(which, next_index);
311 			} else {
312 				/*
313 				 * We could try to exhaustively detect cases
314 				 * under which we can skip the segment load,
315 				 * but there's really only one case that matters
316 				 * for performance: if both the previous and
317 				 * next states are fully zeroed, we can skip
318 				 * the load.
319 				 *
320 				 * (This assumes that prev_base == 0 has no
321 				 * false positives.  This is the case on
322 				 * Intel-style CPUs.)
323 				 */
324 				if (likely(prev_index | next_index | prev_base))
325 					loadseg(which, next_index);
326 			}
327 		} else {
328 			if (prev_index != next_index)
329 				loadseg(which, next_index);
330 			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
331 			       next_base);
332 		}
333 	} else {
334 		/*
335 		 * The next task is using a real segment.  Loading the selector
336 		 * is sufficient.
337 		 */
338 		loadseg(which, next_index);
339 	}
340 }
341 
x86_fsgsbase_load(struct thread_struct * prev,struct thread_struct * next)342 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
343 					      struct thread_struct *next)
344 {
345 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
346 		/* Update the FS and GS selectors if they could have changed. */
347 		if (unlikely(prev->fsindex || next->fsindex))
348 			loadseg(FS, next->fsindex);
349 		if (unlikely(prev->gsindex || next->gsindex))
350 			loadseg(GS, next->gsindex);
351 
352 		/* Update the bases. */
353 		wrfsbase(next->fsbase);
354 		__wrgsbase_inactive(next->gsbase);
355 	} else {
356 		load_seg_legacy(prev->fsindex, prev->fsbase,
357 				next->fsindex, next->fsbase, FS);
358 		load_seg_legacy(prev->gsindex, prev->gsbase,
359 				next->gsindex, next->gsbase, GS);
360 	}
361 }
362 
x86_fsgsbase_read_task(struct task_struct * task,unsigned short selector)363 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
364 				     unsigned short selector)
365 {
366 	unsigned short idx = selector >> 3;
367 	unsigned long base;
368 
369 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
370 		if (unlikely(idx >= GDT_ENTRIES))
371 			return 0;
372 
373 		/*
374 		 * There are no user segments in the GDT with nonzero bases
375 		 * other than the TLS segments.
376 		 */
377 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
378 			return 0;
379 
380 		idx -= GDT_ENTRY_TLS_MIN;
381 		base = get_desc_base(&task->thread.tls_array[idx]);
382 	} else {
383 #ifdef CONFIG_MODIFY_LDT_SYSCALL
384 		struct ldt_struct *ldt;
385 
386 		/*
387 		 * If performance here mattered, we could protect the LDT
388 		 * with RCU.  This is a slow path, though, so we can just
389 		 * take the mutex.
390 		 */
391 		mutex_lock(&task->mm->context.lock);
392 		ldt = task->mm->context.ldt;
393 		if (unlikely(!ldt || idx >= ldt->nr_entries))
394 			base = 0;
395 		else
396 			base = get_desc_base(ldt->entries + idx);
397 		mutex_unlock(&task->mm->context.lock);
398 #else
399 		base = 0;
400 #endif
401 	}
402 
403 	return base;
404 }
405 
x86_gsbase_read_cpu_inactive(void)406 unsigned long x86_gsbase_read_cpu_inactive(void)
407 {
408 	unsigned long gsbase;
409 
410 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
411 		unsigned long flags;
412 
413 		local_irq_save(flags);
414 		gsbase = __rdgsbase_inactive();
415 		local_irq_restore(flags);
416 	} else {
417 		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
418 	}
419 
420 	return gsbase;
421 }
422 
x86_gsbase_write_cpu_inactive(unsigned long gsbase)423 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
424 {
425 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
426 		unsigned long flags;
427 
428 		local_irq_save(flags);
429 		__wrgsbase_inactive(gsbase);
430 		local_irq_restore(flags);
431 	} else {
432 		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
433 	}
434 }
435 
x86_fsbase_read_task(struct task_struct * task)436 unsigned long x86_fsbase_read_task(struct task_struct *task)
437 {
438 	unsigned long fsbase;
439 
440 	if (task == current)
441 		fsbase = x86_fsbase_read_cpu();
442 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
443 		 (task->thread.fsindex == 0))
444 		fsbase = task->thread.fsbase;
445 	else
446 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
447 
448 	return fsbase;
449 }
450 
x86_gsbase_read_task(struct task_struct * task)451 unsigned long x86_gsbase_read_task(struct task_struct *task)
452 {
453 	unsigned long gsbase;
454 
455 	if (task == current)
456 		gsbase = x86_gsbase_read_cpu_inactive();
457 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
458 		 (task->thread.gsindex == 0))
459 		gsbase = task->thread.gsbase;
460 	else
461 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
462 
463 	return gsbase;
464 }
465 
x86_fsbase_write_task(struct task_struct * task,unsigned long fsbase)466 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
467 {
468 	WARN_ON_ONCE(task == current);
469 
470 	task->thread.fsbase = fsbase;
471 }
472 
x86_gsbase_write_task(struct task_struct * task,unsigned long gsbase)473 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
474 {
475 	WARN_ON_ONCE(task == current);
476 
477 	task->thread.gsbase = gsbase;
478 }
479 
/*
 * Common part of start_thread() and compat_start_thread().
 *
 * Resets the data segment registers of the current CPU and fills in
 * @regs: FS and GS get the null selector, DS and ES get @_ds, and @regs
 * is set up with @new_ip, @new_sp, @_cs, @_ss and flags of X86_EFLAGS_IF.
 *
 * @regs is expected to be the current task's pt_regs; a one-time warning
 * fires otherwise.
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	/* Null FS/GS selectors; DS and ES both take the caller's @_ds. */
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	/* Only the interrupt-enable flag is set in the new flags value. */
	regs->flags		= X86_EFLAGS_IF;
}
504 
505 void
start_thread(struct pt_regs * regs,unsigned long new_ip,unsigned long new_sp)506 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
507 {
508 	start_thread_common(regs, new_ip, new_sp,
509 			    __USER_CS, __USER_DS, 0);
510 }
511 EXPORT_SYMBOL_GPL(start_thread);
512 
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of start_thread().  CS is __USER_CS when @x32 is
 * true and __USER32_CS otherwise; SS, DS and ES all use __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	unsigned int code_sel = __USER32_CS;

	if (x32)
		code_sel = __USER_CS;

	start_thread_common(regs, new_ip, new_sp, code_sel,
			    __USER_DS, __USER_DS);
}
#endif
521 
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * Performs the x86-64 specific part of a context switch from @prev_p to
 * @next_p and returns @prev_p.  The order of the steps below matters; see
 * the comments on the individual steps.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	/* Debug check: warn if the hardirq stack is marked in use here. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(hardirq_stack_inuse));

	/* Skipped when TIF_NEED_FPU_LOAD is set for the outgoing task. */
	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* Install next's FS/GS selectors and bases. */
	x86_fsgsbase_load(prev, next);

	/*
	 * Switch the per-CPU current task and top-of-stack, then the FPU
	 * context.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in();

	return prev_p;
}
637 
set_personality_64bit(void)638 void set_personality_64bit(void)
639 {
640 	/* inherit personality from parent */
641 
642 	/* Make sure to be in 64bit mode */
643 	clear_thread_flag(TIF_ADDR32);
644 	/* Pretend that this comes from a 64bit execve */
645 	task_pt_regs(current)->orig_ax = __NR_execve;
646 	current_thread_info()->status &= ~TS_COMPAT;
647 	if (current->mm)
648 		current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
649 
650 	/* TBD: overwrites user setup. Should have two bits.
651 	   But 64bit processes have always behaved this way,
652 	   so it's not too bad. The main problem is just that
653 	   32bit children are affected again. */
654 	current->personality &= ~READ_IMPLIES_EXEC;
655 }
656 
/*
 * x32 half of set_personality_ia32().  Compiles to an empty function
 * unless CONFIG_X86_X32 is set.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	struct pt_regs *regs = task_pt_regs(current);

	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;

	/*
	 * in_32bit_syscall() decides compat status from the presence of
	 * the x32 syscall bit, and the x86 mmap() code relies on the
	 * syscall bitness.  Set the x32 syscall bit right here so that
	 * in_32bit_syscall() works during exec().
	 *
	 * In other words: pretend to come from an x32 execve.
	 */
	regs->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
676 
/*
 * ia32 half of set_personality_ia32().  Compiles to an empty function
 * unless CONFIG_IA32_EMULATION is set.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	struct pt_regs *regs = task_pt_regs(current);

	/*
	 * uprobes applied to this MM need to know that it is an ia32 one
	 * and cannot use user_64bit_mode() at that time.
	 */
	if (current->mm)
		current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;

	current->personality |= force_personality32;

	/* Prepare the first "return" to user space. */
	regs->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
694 
/*
 * Put the current task into 32-bit address mode and apply either the x32
 * (@x32 true) or the ia32 (@x32 false) personality setup.
 */
void set_personality_ia32(bool x32)
{
	/* Make sure we are in 32-bit mode. */
	set_thread_flag(TIF_ADDR32);

	if (!x32) {
		__set_personality_ia32();
		return;
	}

	__set_personality_x32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
706 
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map the vDSO @image at @addr through map_vdso_once().
 *
 * Returns the nonzero result of map_vdso_once() if there is one, and
 * otherwise the size of the image.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int err = map_vdso_once(image, addr);

	return err ? err : (long)image->size;
}
#endif
719 
do_arch_prctl_64(struct task_struct * task,int option,unsigned long arg2)720 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
721 {
722 	int ret = 0;
723 
724 	switch (option) {
725 	case ARCH_SET_GS: {
726 		if (unlikely(arg2 >= TASK_SIZE_MAX))
727 			return -EPERM;
728 
729 		preempt_disable();
730 		/*
731 		 * ARCH_SET_GS has always overwritten the index
732 		 * and the base. Zero is the most sensible value
733 		 * to put in the index, and is the only value that
734 		 * makes any sense if FSGSBASE is unavailable.
735 		 */
736 		if (task == current) {
737 			loadseg(GS, 0);
738 			x86_gsbase_write_cpu_inactive(arg2);
739 
740 			/*
741 			 * On non-FSGSBASE systems, save_base_legacy() expects
742 			 * that we also fill in thread.gsbase.
743 			 */
744 			task->thread.gsbase = arg2;
745 
746 		} else {
747 			task->thread.gsindex = 0;
748 			x86_gsbase_write_task(task, arg2);
749 		}
750 		preempt_enable();
751 		break;
752 	}
753 	case ARCH_SET_FS: {
754 		/*
755 		 * Not strictly needed for %fs, but do it for symmetry
756 		 * with %gs
757 		 */
758 		if (unlikely(arg2 >= TASK_SIZE_MAX))
759 			return -EPERM;
760 
761 		preempt_disable();
762 		/*
763 		 * Set the selector to 0 for the same reason
764 		 * as %gs above.
765 		 */
766 		if (task == current) {
767 			loadseg(FS, 0);
768 			x86_fsbase_write_cpu(arg2);
769 
770 			/*
771 			 * On non-FSGSBASE systems, save_base_legacy() expects
772 			 * that we also fill in thread.fsbase.
773 			 */
774 			task->thread.fsbase = arg2;
775 		} else {
776 			task->thread.fsindex = 0;
777 			x86_fsbase_write_task(task, arg2);
778 		}
779 		preempt_enable();
780 		break;
781 	}
782 	case ARCH_GET_FS: {
783 		unsigned long base = x86_fsbase_read_task(task);
784 
785 		ret = put_user(base, (unsigned long __user *)arg2);
786 		break;
787 	}
788 	case ARCH_GET_GS: {
789 		unsigned long base = x86_gsbase_read_task(task);
790 
791 		ret = put_user(base, (unsigned long __user *)arg2);
792 		break;
793 	}
794 
795 #ifdef CONFIG_CHECKPOINT_RESTORE
796 # ifdef CONFIG_X86_X32_ABI
797 	case ARCH_MAP_VDSO_X32:
798 		return prctl_map_vdso(&vdso_image_x32, arg2);
799 # endif
800 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
801 	case ARCH_MAP_VDSO_32:
802 		return prctl_map_vdso(&vdso_image_32, arg2);
803 # endif
804 	case ARCH_MAP_VDSO_64:
805 		return prctl_map_vdso(&vdso_image_64, arg2);
806 #endif
807 
808 	default:
809 		ret = -EINVAL;
810 		break;
811 	}
812 
813 	return ret;
814 }
815 
/*
 * arch_prctl() system call: the 64-bit specific handler gets the first
 * try; whenever it returns -EINVAL the request is passed on to
 * do_arch_prctl_common().
 */
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret = do_arch_prctl_64(current, option, arg2);

	if (ret != -EINVAL)
		return ret;

	return do_arch_prctl_common(current, option, arg2);
}
826 
#ifdef CONFIG_IA32_EMULATION
/*
 * Compat arch_prctl() system call: only the options handled by
 * do_arch_prctl_common() are available; do_arch_prctl_64() is not
 * consulted.
 */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif
833 
KSTK_ESP(struct task_struct * task)834 unsigned long KSTK_ESP(struct task_struct *task)
835 {
836 	return task_pt_regs(task)->sp;
837 }
838