xref: /linux/arch/powerpc/platforms/pseries/lpar.c (revision 0be3ff0c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * pSeries_lpar.c
4  * Copyright (C) 2001 Todd Inglett, IBM Corporation
5  *
6  * pSeries LPAR support.
7  */
8 
9 /* Enables debugging of low-level hash table routines - careful! */
10 #undef DEBUG
11 #define pr_fmt(fmt) "lpar: " fmt
12 
13 #include <linux/kernel.h>
14 #include <linux/dma-mapping.h>
15 #include <linux/console.h>
16 #include <linux/export.h>
17 #include <linux/jump_label.h>
18 #include <linux/delay.h>
19 #include <linux/stop_machine.h>
20 #include <linux/spinlock.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/workqueue.h>
23 #include <linux/proc_fs.h>
24 #include <linux/pgtable.h>
25 #include <linux/debugfs.h>
26 
27 #include <asm/processor.h>
28 #include <asm/mmu.h>
29 #include <asm/page.h>
30 #include <asm/machdep.h>
31 #include <asm/mmu_context.h>
32 #include <asm/iommu.h>
33 #include <asm/tlb.h>
34 #include <asm/prom.h>
35 #include <asm/cputable.h>
36 #include <asm/udbg.h>
37 #include <asm/smp.h>
38 #include <asm/trace.h>
39 #include <asm/firmware.h>
40 #include <asm/plpar_wrappers.h>
41 #include <asm/kexec.h>
42 #include <asm/fadump.h>
43 #include <asm/dtl.h>
44 
45 #include "pseries.h"
46 
47 /* Flag bits for H_BULK_REMOVE */
48 #define HBR_REQUEST	0x4000000000000000UL
49 #define HBR_RESPONSE	0x8000000000000000UL
50 #define HBR_END		0xc000000000000000UL
51 #define HBR_AVPN	0x0200000000000000UL
52 #define HBR_ANDCOND	0x0100000000000000UL
53 
54 
55 /* in hvCall.S */
56 EXPORT_SYMBOL(plpar_hcall);
57 EXPORT_SYMBOL(plpar_hcall9);
58 EXPORT_SYMBOL(plpar_hcall_norets);
59 
60 #ifdef CONFIG_PPC_64S_HASH_MMU
61 /*
62  * H_BLOCK_REMOVE supported block size for this page size in segment whose base
63  * page size is that page size.
64  *
65  * The first index is the segment base page size, the second one is the actual
66  * page size.
67  */
68 static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
69 #endif
70 
71 /*
72  * Due to the involved complexity, and that the current hypervisor is only
73  * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE
74  * buffer size to 8 size block.
75  */
76 #define HBLKRM_SUPPORTED_BLOCK_SIZE 8
77 
/*
 * Event mask copied into each CPU's lppaca dtl_enable_mask. With native
 * virtual CPU accounting it starts out as DTL_LOG_PREEMPT; otherwise it
 * starts out as zero, in which case register_dtl_buffer() skips registration.
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static u8 dtl_mask = DTL_LOG_PREEMPT;
#else
static u8 dtl_mask;
#endif
83 
84 void alloc_dtl_buffers(unsigned long *time_limit)
85 {
86 	int cpu;
87 	struct paca_struct *pp;
88 	struct dtl_entry *dtl;
89 
90 	for_each_possible_cpu(cpu) {
91 		pp = paca_ptrs[cpu];
92 		if (pp->dispatch_log)
93 			continue;
94 		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
95 		if (!dtl) {
96 			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
97 				cpu);
98 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
99 			pr_warn("Stolen time statistics will be unreliable\n");
100 #endif
101 			break;
102 		}
103 
104 		pp->dtl_ridx = 0;
105 		pp->dispatch_log = dtl;
106 		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
107 		pp->dtl_curr = dtl;
108 
109 		if (time_limit && time_after(jiffies, *time_limit)) {
110 			cond_resched();
111 			*time_limit = jiffies + HZ;
112 		}
113 	}
114 }
115 
116 void register_dtl_buffer(int cpu)
117 {
118 	long ret;
119 	struct paca_struct *pp;
120 	struct dtl_entry *dtl;
121 	int hwcpu = get_hard_smp_processor_id(cpu);
122 
123 	pp = paca_ptrs[cpu];
124 	dtl = pp->dispatch_log;
125 	if (dtl && dtl_mask) {
126 		pp->dtl_ridx = 0;
127 		pp->dtl_curr = dtl;
128 		lppaca_of(cpu).dtl_idx = 0;
129 
130 		/* hypervisor reads buffer length from this field */
131 		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
132 		ret = register_dtl(hwcpu, __pa(dtl));
133 		if (ret)
134 			pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
135 			       cpu, hwcpu, ret);
136 
137 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
138 	}
139 }
140 
141 #ifdef CONFIG_PPC_SPLPAR
/* Per-CPU delayed work item that runs process_dtl_buffer(). */
struct dtl_worker {
	struct delayed_work work;
	/* CPU this worker was set up for; the handler bails out if run elsewhere. */
	int cpu;
};
146 
/*
 * Per-CPU dispatch counters maintained by update_vcpu_disp_stat() and printed
 * by vcpudispatch_stats_display().
 */
struct vcpu_dispatch_data {
	/* Processor id of the previous dispatch; -1 until the first one is seen. */
	int last_disp_cpu;

	/* Dispatches counted (the very first one only seeds last_disp_cpu). */
	int total_disp;

	/* Same processor id, or same first thread sibling, as the previous dispatch. */
	int same_cpu_disp;
	/* cpu_relative_dispatch_distance() returned 0, 1 and 2 respectively. */
	int same_chip_disp;
	int diff_chip_disp;
	int far_chip_disp;

	/* cpu_home_node_dispatch_distance() returned 0, 1 and 2 respectively. */
	int numa_home_disp;
	int numa_remote_disp;
	int numa_far_disp;
};
161 
162 /*
163  * This represents the number of cpus in the hypervisor. Since there is no
164  * architected way to discover the number of processors in the host, we
165  * provision for dealing with NR_CPUS. This is currently 2048 by default, and
166  * is sufficient for our purposes. This will need to be tweaked if
167  * CONFIG_NR_CPUS is changed.
168  */
169 #define NR_CPUS_H	NR_CPUS
170 
/* Write-locked by dtl_worker_enable() and released by dtl_worker_disable(). */
DEFINE_RWLOCK(dtl_access_lock);
/* Dispatch counters, reset on enable in vcpudispatch_stats_write(). */
static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
/* Index of the next DTL entry process_dtl_buffer() will consume. */
static DEFINE_PER_CPU(u64, dtl_entry_ridx);
static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
/* Hotplug state returned by cpuhp_setup_state(); needed to remove it again. */
static enum cpuhp_state dtl_worker_state;
/* Serialises enable/disable requests in vcpudispatch_stats_write(). */
static DEFINE_MUTEX(dtl_enable_mutex);
/* Non-zero while dispatch statistics collection is enabled. */
static int vcpudispatch_stats_on __read_mostly;
/* The worker is rescheduled every HZ / vcpudispatch_stats_freq jiffies. */
static int vcpudispatch_stats_freq = 50;
/*
 * Associativity caches, VPHN_ASSOC_BUFSIZE words per core, allocated by
 * init_cpu_associativity() and filled on demand by __get_cpu_associativity().
 */
static __be32 *vcpu_associativity, *pcpu_associativity;
180 
181 
182 static void free_dtl_buffers(unsigned long *time_limit)
183 {
184 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
185 	int cpu;
186 	struct paca_struct *pp;
187 
188 	for_each_possible_cpu(cpu) {
189 		pp = paca_ptrs[cpu];
190 		if (!pp->dispatch_log)
191 			continue;
192 		kmem_cache_free(dtl_cache, pp->dispatch_log);
193 		pp->dtl_ridx = 0;
194 		pp->dispatch_log = 0;
195 		pp->dispatch_log_end = 0;
196 		pp->dtl_curr = 0;
197 
198 		if (time_limit && time_after(jiffies, *time_limit)) {
199 			cond_resched();
200 			*time_limit = jiffies + HZ;
201 		}
202 	}
203 #endif
204 }
205 
206 static int init_cpu_associativity(void)
207 {
208 	vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
209 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
210 	pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
211 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
212 
213 	if (!vcpu_associativity || !pcpu_associativity) {
214 		pr_err("error allocating memory for associativity information\n");
215 		return -ENOMEM;
216 	}
217 
218 	return 0;
219 }
220 
221 static void destroy_cpu_associativity(void)
222 {
223 	kfree(vcpu_associativity);
224 	kfree(pcpu_associativity);
225 	vcpu_associativity = pcpu_associativity = 0;
226 }
227 
228 static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
229 {
230 	__be32 *assoc;
231 	int rc = 0;
232 
233 	assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
234 	if (!assoc[0]) {
235 		rc = hcall_vphn(cpu, flag, &assoc[0]);
236 		if (rc)
237 			return NULL;
238 	}
239 
240 	return assoc;
241 }
242 
243 static __be32 *get_pcpu_associativity(int cpu)
244 {
245 	return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
246 }
247 
248 static __be32 *get_vcpu_associativity(int cpu)
249 {
250 	return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
251 }
252 
253 static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
254 {
255 	__be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
256 
257 	if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
258 		return -EINVAL;
259 
260 	last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
261 	cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
262 
263 	if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
264 		return -EIO;
265 
266 	return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
267 }
268 
269 static int cpu_home_node_dispatch_distance(int disp_cpu)
270 {
271 	__be32 *disp_cpu_assoc, *vcpu_assoc;
272 	int vcpu_id = smp_processor_id();
273 
274 	if (disp_cpu >= NR_CPUS_H) {
275 		pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
276 						disp_cpu, NR_CPUS_H);
277 		return -EINVAL;
278 	}
279 
280 	disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
281 	vcpu_assoc = get_vcpu_associativity(vcpu_id);
282 
283 	if (!disp_cpu_assoc || !vcpu_assoc)
284 		return -EIO;
285 
286 	return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
287 }
288 
289 static void update_vcpu_disp_stat(int disp_cpu)
290 {
291 	struct vcpu_dispatch_data *disp;
292 	int distance;
293 
294 	disp = this_cpu_ptr(&vcpu_disp_data);
295 	if (disp->last_disp_cpu == -1) {
296 		disp->last_disp_cpu = disp_cpu;
297 		return;
298 	}
299 
300 	disp->total_disp++;
301 
302 	if (disp->last_disp_cpu == disp_cpu ||
303 		(cpu_first_thread_sibling(disp->last_disp_cpu) ==
304 					cpu_first_thread_sibling(disp_cpu)))
305 		disp->same_cpu_disp++;
306 	else {
307 		distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
308 								disp_cpu);
309 		if (distance < 0)
310 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
311 					smp_processor_id());
312 		else {
313 			switch (distance) {
314 			case 0:
315 				disp->same_chip_disp++;
316 				break;
317 			case 1:
318 				disp->diff_chip_disp++;
319 				break;
320 			case 2:
321 				disp->far_chip_disp++;
322 				break;
323 			default:
324 				pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
325 						 smp_processor_id(),
326 						 disp->last_disp_cpu,
327 						 disp_cpu,
328 						 distance);
329 			}
330 		}
331 	}
332 
333 	distance = cpu_home_node_dispatch_distance(disp_cpu);
334 	if (distance < 0)
335 		pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
336 				smp_processor_id());
337 	else {
338 		switch (distance) {
339 		case 0:
340 			disp->numa_home_disp++;
341 			break;
342 		case 1:
343 			disp->numa_remote_disp++;
344 			break;
345 		case 2:
346 			disp->numa_far_disp++;
347 			break;
348 		default:
349 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
350 						 smp_processor_id(),
351 						 disp_cpu,
352 						 distance);
353 		}
354 	}
355 
356 	disp->last_disp_cpu = disp_cpu;
357 }
358 
/*
 * Delayed-work handler: feed the dispatch trace log entries that appeared
 * since the last run into update_vcpu_disp_stat(), then re-arm itself.
 *
 * The work is not re-armed if this CPU has no log buffer or if the handler
 * finds itself running on a CPU other than the one it was set up for.
 */
static void process_dtl_buffer(struct work_struct *work)
{
	struct dtl_entry dtle;
	/* Index of the next entry to consume, saved by the previous run. */
	u64 i = __this_cpu_read(dtl_entry_ridx);
	/* The log is indexed modulo N_DISPATCH_LOG. */
	struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
	struct lppaca *vpa = local_paca->lppaca_ptr;
	struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);

	if (!local_paca->dispatch_log)
		return;

	/* if we have been migrated away, we cancel ourself */
	if (d->cpu != smp_processor_id()) {
		pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
						smp_processor_id());
		return;
	}

	/* Nothing new since the last run: just re-arm. */
	if (i == be64_to_cpu(vpa->dtl_idx))
		goto out;

	while (i < be64_to_cpu(vpa->dtl_idx)) {
		/*
		 * Copy the entry first and only then re-read dtl_idx, so the
		 * overflow check below covers the copy that was just taken.
		 */
		dtle = *dtl;
		barrier();
		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
			/* buffer has overflowed */
			pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
				d->cpu,
				be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
			/* Skip ahead to N_DISPATCH_LOG entries behind dtl_idx and retry. */
			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
			continue;
		}
		update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
		++i;
		++dtl;
		/* Wrap the cursor at the end of the buffer. */
		if (dtl == dtl_end)
			dtl = local_paca->dispatch_log;
	}

	__this_cpu_write(dtl_entry_ridx, i);

out:
	/* Run again after HZ / vcpudispatch_stats_freq jiffies. */
	schedule_delayed_work_on(d->cpu, to_delayed_work(work),
					HZ / vcpudispatch_stats_freq);
}
406 
407 static int dtl_worker_online(unsigned int cpu)
408 {
409 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
410 
411 	memset(d, 0, sizeof(*d));
412 	INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
413 	d->cpu = cpu;
414 
415 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
416 	per_cpu(dtl_entry_ridx, cpu) = 0;
417 	register_dtl_buffer(cpu);
418 #else
419 	per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
420 #endif
421 
422 	schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
423 	return 0;
424 }
425 
426 static int dtl_worker_offline(unsigned int cpu)
427 {
428 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
429 
430 	cancel_delayed_work_sync(&d->work);
431 
432 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
433 	unregister_dtl(get_hard_smp_processor_id(cpu));
434 #endif
435 
436 	return 0;
437 }
438 
439 static void set_global_dtl_mask(u8 mask)
440 {
441 	int cpu;
442 
443 	dtl_mask = mask;
444 	for_each_present_cpu(cpu)
445 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
446 }
447 
448 static void reset_global_dtl_mask(void)
449 {
450 	int cpu;
451 
452 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
453 	dtl_mask = DTL_LOG_PREEMPT;
454 #else
455 	dtl_mask = 0;
456 #endif
457 	for_each_present_cpu(cpu)
458 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
459 }
460 
461 static int dtl_worker_enable(unsigned long *time_limit)
462 {
463 	int rc = 0, state;
464 
465 	if (!write_trylock(&dtl_access_lock)) {
466 		rc = -EBUSY;
467 		goto out;
468 	}
469 
470 	set_global_dtl_mask(DTL_LOG_ALL);
471 
472 	/* Setup dtl buffers and register those */
473 	alloc_dtl_buffers(time_limit);
474 
475 	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
476 					dtl_worker_online, dtl_worker_offline);
477 	if (state < 0) {
478 		pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
479 		free_dtl_buffers(time_limit);
480 		reset_global_dtl_mask();
481 		write_unlock(&dtl_access_lock);
482 		rc = -EINVAL;
483 		goto out;
484 	}
485 	dtl_worker_state = state;
486 
487 out:
488 	return rc;
489 }
490 
/*
 * Undo dtl_worker_enable(): remove the hotplug state registered there, free
 * the DTL buffers, restore the default dtl_mask and release the write lock on
 * dtl_access_lock that dtl_worker_enable() left held.
 */
static void dtl_worker_disable(unsigned long *time_limit)
{
	cpuhp_remove_state(dtl_worker_state);
	free_dtl_buffers(time_limit);
	reset_global_dtl_mask();
	write_unlock(&dtl_access_lock);
}
498 
499 static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
500 		size_t count, loff_t *ppos)
501 {
502 	unsigned long time_limit = jiffies + HZ;
503 	struct vcpu_dispatch_data *disp;
504 	int rc, cmd, cpu;
505 	char buf[16];
506 
507 	if (count > 15)
508 		return -EINVAL;
509 
510 	if (copy_from_user(buf, p, count))
511 		return -EFAULT;
512 
513 	buf[count] = 0;
514 	rc = kstrtoint(buf, 0, &cmd);
515 	if (rc || cmd < 0 || cmd > 1) {
516 		pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
517 		return rc ? rc : -EINVAL;
518 	}
519 
520 	mutex_lock(&dtl_enable_mutex);
521 
522 	if ((cmd == 0 && !vcpudispatch_stats_on) ||
523 			(cmd == 1 && vcpudispatch_stats_on))
524 		goto out;
525 
526 	if (cmd) {
527 		rc = init_cpu_associativity();
528 		if (rc)
529 			goto out;
530 
531 		for_each_possible_cpu(cpu) {
532 			disp = per_cpu_ptr(&vcpu_disp_data, cpu);
533 			memset(disp, 0, sizeof(*disp));
534 			disp->last_disp_cpu = -1;
535 		}
536 
537 		rc = dtl_worker_enable(&time_limit);
538 		if (rc) {
539 			destroy_cpu_associativity();
540 			goto out;
541 		}
542 	} else {
543 		dtl_worker_disable(&time_limit);
544 		destroy_cpu_associativity();
545 	}
546 
547 	vcpudispatch_stats_on = cmd;
548 
549 out:
550 	mutex_unlock(&dtl_enable_mutex);
551 	if (rc)
552 		return rc;
553 	return count;
554 }
555 
556 static int vcpudispatch_stats_display(struct seq_file *p, void *v)
557 {
558 	int cpu;
559 	struct vcpu_dispatch_data *disp;
560 
561 	if (!vcpudispatch_stats_on) {
562 		seq_puts(p, "off\n");
563 		return 0;
564 	}
565 
566 	for_each_online_cpu(cpu) {
567 		disp = per_cpu_ptr(&vcpu_disp_data, cpu);
568 		seq_printf(p, "cpu%d", cpu);
569 		seq_put_decimal_ull(p, " ", disp->total_disp);
570 		seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
571 		seq_put_decimal_ull(p, " ", disp->same_chip_disp);
572 		seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
573 		seq_put_decimal_ull(p, " ", disp->far_chip_disp);
574 		seq_put_decimal_ull(p, " ", disp->numa_home_disp);
575 		seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
576 		seq_put_decimal_ull(p, " ", disp->numa_far_disp);
577 		seq_puts(p, "\n");
578 	}
579 
580 	return 0;
581 }
582 
583 static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
584 {
585 	return single_open(file, vcpudispatch_stats_display, NULL);
586 }
587 
/* File operations for the "powerpc/vcpudispatch_stats" procfs entry. */
static const struct proc_ops vcpudispatch_stats_proc_ops = {
	.proc_open	= vcpudispatch_stats_open,
	.proc_read	= seq_read,
	.proc_write	= vcpudispatch_stats_write,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};
595 
596 static ssize_t vcpudispatch_stats_freq_write(struct file *file,
597 		const char __user *p, size_t count, loff_t *ppos)
598 {
599 	int rc, freq;
600 	char buf[16];
601 
602 	if (count > 15)
603 		return -EINVAL;
604 
605 	if (copy_from_user(buf, p, count))
606 		return -EFAULT;
607 
608 	buf[count] = 0;
609 	rc = kstrtoint(buf, 0, &freq);
610 	if (rc || freq < 1 || freq > HZ) {
611 		pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
612 				HZ);
613 		return rc ? rc : -EINVAL;
614 	}
615 
616 	vcpudispatch_stats_freq = freq;
617 
618 	return count;
619 }
620 
621 static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
622 {
623 	seq_printf(p, "%d\n", vcpudispatch_stats_freq);
624 	return 0;
625 }
626 
627 static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
628 {
629 	return single_open(file, vcpudispatch_stats_freq_display, NULL);
630 }
631 
/* File operations for the "powerpc/vcpudispatch_stats_freq" procfs entry. */
static const struct proc_ops vcpudispatch_stats_freq_proc_ops = {
	.proc_open	= vcpudispatch_stats_freq_open,
	.proc_read	= seq_read,
	.proc_write	= vcpudispatch_stats_freq_write,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};
639 
640 static int __init vcpudispatch_stats_procfs_init(void)
641 {
642 	/*
643 	 * Avoid smp_processor_id while preemptible. All CPUs should have
644 	 * the same value for lppaca_shared_proc.
645 	 */
646 	preempt_disable();
647 	if (!lppaca_shared_proc(get_lppaca())) {
648 		preempt_enable();
649 		return 0;
650 	}
651 	preempt_enable();
652 
653 	if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
654 					&vcpudispatch_stats_proc_ops))
655 		pr_err("vcpudispatch_stats: error creating procfs file\n");
656 	else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
657 					&vcpudispatch_stats_freq_proc_ops))
658 		pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
659 
660 	return 0;
661 }
662 
663 machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
664 #endif /* CONFIG_PPC_SPLPAR */
665 
666 void vpa_init(int cpu)
667 {
668 	int hwcpu = get_hard_smp_processor_id(cpu);
669 	unsigned long addr;
670 	long ret;
671 
672 	/*
673 	 * The spec says it "may be problematic" if CPU x registers the VPA of
674 	 * CPU y. We should never do that, but wail if we ever do.
675 	 */
676 	WARN_ON(cpu != smp_processor_id());
677 
678 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
679 		lppaca_of(cpu).vmxregs_in_use = 1;
680 
681 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
682 		lppaca_of(cpu).ebb_regs_in_use = 1;
683 
684 	addr = __pa(&lppaca_of(cpu));
685 	ret = register_vpa(hwcpu, addr);
686 
687 	if (ret) {
688 		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
689 		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
690 		return;
691 	}
692 
693 #ifdef CONFIG_PPC_64S_HASH_MMU
694 	/*
695 	 * PAPR says this feature is SLB-Buffer but firmware never
696 	 * reports that.  All SPLPAR support SLB shadow buffer.
697 	 */
698 	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
699 		addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
700 		ret = register_slb_shadow(hwcpu, addr);
701 		if (ret)
702 			pr_err("WARNING: SLB shadow buffer registration for "
703 			       "cpu %d (hw %d) of area %lx failed with %ld\n",
704 			       cpu, hwcpu, addr, ret);
705 	}
706 #endif /* CONFIG_PPC_64S_HASH_MMU */
707 
708 	/*
709 	 * Register dispatch trace log, if one has been allocated.
710 	 */
711 	register_dtl_buffer(cpu);
712 }
713 
714 #ifdef CONFIG_PPC_BOOK3S_64
715 
716 static int __init pseries_lpar_register_process_table(unsigned long base,
717 			unsigned long page_size, unsigned long table_size)
718 {
719 	long rc;
720 	unsigned long flags = 0;
721 
722 	if (table_size)
723 		flags |= PROC_TABLE_NEW;
724 	if (radix_enabled()) {
725 		flags |= PROC_TABLE_RADIX;
726 		if (mmu_has_feature(MMU_FTR_GTSE))
727 			flags |= PROC_TABLE_GTSE;
728 	} else
729 		flags |= PROC_TABLE_HPT_SLB;
730 	for (;;) {
731 		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
732 					page_size, table_size);
733 		if (!H_IS_LONG_BUSY(rc))
734 			break;
735 		mdelay(get_longbusy_msecs(rc));
736 	}
737 	if (rc != H_SUCCESS) {
738 		pr_err("Failed to register process table (rc=%ld)\n", rc);
739 		BUG();
740 	}
741 	return rc;
742 }
743 
744 #ifdef CONFIG_PPC_64S_HASH_MMU
745 
746 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
747 				     unsigned long vpn, unsigned long pa,
748 				     unsigned long rflags, unsigned long vflags,
749 				     int psize, int apsize, int ssize)
750 {
751 	unsigned long lpar_rc;
752 	unsigned long flags;
753 	unsigned long slot;
754 	unsigned long hpte_v, hpte_r;
755 
756 	if (!(vflags & HPTE_V_BOLTED))
757 		pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
758 			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
759 			 hpte_group, vpn,  pa, rflags, vflags, psize);
760 
761 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
762 	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
763 
764 	if (!(vflags & HPTE_V_BOLTED))
765 		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
766 
767 	/* Now fill in the actual HPTE */
768 	/* Set CEC cookie to 0         */
769 	/* Zero page = 0               */
770 	/* I-cache Invalidate = 0      */
771 	/* I-cache synchronize = 0     */
772 	/* Exact = 0                   */
773 	flags = 0;
774 
775 	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
776 		flags |= H_COALESCE_CAND;
777 
778 	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
779 	if (unlikely(lpar_rc == H_PTEG_FULL)) {
780 		pr_devel("Hash table group is full\n");
781 		return -1;
782 	}
783 
784 	/*
785 	 * Since we try and ioremap PHBs we don't own, the pte insert
786 	 * will fail. However we must catch the failure in hash_page
787 	 * or we will loop forever, so return -2 in this case.
788 	 */
789 	if (unlikely(lpar_rc != H_SUCCESS)) {
790 		pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
791 		return -2;
792 	}
793 	if (!(vflags & HPTE_V_BOLTED))
794 		pr_devel(" -> slot: %lu\n", slot & 7);
795 
796 	/* Because of iSeries, we have to pass down the secondary
797 	 * bucket bit here as well
798 	 */
799 	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
800 }
801 
802 static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
803 
804 static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
805 {
806 	unsigned long slot_offset;
807 	unsigned long lpar_rc;
808 	int i;
809 	unsigned long dummy1, dummy2;
810 
811 	/* pick a random slot to start at */
812 	slot_offset = mftb() & 0x7;
813 
814 	for (i = 0; i < HPTES_PER_GROUP; i++) {
815 
816 		/* don't remove a bolted entry */
817 		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
818 					   HPTE_V_BOLTED, &dummy1, &dummy2);
819 		if (lpar_rc == H_SUCCESS)
820 			return i;
821 
822 		/*
823 		 * The test for adjunct partition is performed before the
824 		 * ANDCOND test.  H_RESOURCE may be returned, so we need to
825 		 * check for that as well.
826 		 */
827 		BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
828 
829 		slot_offset++;
830 		slot_offset &= 0x7;
831 	}
832 
833 	return -1;
834 }
835 
836 /* Called during kexec sequence with MMU off */
837 static notrace void manual_hpte_clear_all(void)
838 {
839 	unsigned long size_bytes = 1UL << ppc64_pft_size;
840 	unsigned long hpte_count = size_bytes >> 4;
841 	struct {
842 		unsigned long pteh;
843 		unsigned long ptel;
844 	} ptes[4];
845 	long lpar_rc;
846 	unsigned long i, j;
847 
848 	/* Read in batches of 4,
849 	 * invalidate only valid entries not in the VRMA
850 	 * hpte_count will be a multiple of 4
851          */
852 	for (i = 0; i < hpte_count; i += 4) {
853 		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
854 		if (lpar_rc != H_SUCCESS) {
855 			pr_info("Failed to read hash page table at %ld err %ld\n",
856 				i, lpar_rc);
857 			continue;
858 		}
859 		for (j = 0; j < 4; j++){
860 			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
861 				HPTE_V_VRMA_MASK)
862 				continue;
863 			if (ptes[j].pteh & HPTE_V_VALID)
864 				plpar_pte_remove_raw(0, i + j, 0,
865 					&(ptes[j].pteh), &(ptes[j].ptel));
866 		}
867 	}
868 }
869 
870 /* Called during kexec sequence with MMU off */
871 static notrace int hcall_hpte_clear_all(void)
872 {
873 	int rc;
874 
875 	do {
876 		rc = plpar_hcall_norets(H_CLEAR_HPT);
877 	} while (rc == H_CONTINUE);
878 
879 	return rc;
880 }
881 
882 /* Called during kexec sequence with MMU off */
883 static notrace void pseries_hpte_clear_all(void)
884 {
885 	int rc;
886 
887 	rc = hcall_hpte_clear_all();
888 	if (rc != H_SUCCESS)
889 		manual_hpte_clear_all();
890 
891 #ifdef __LITTLE_ENDIAN__
892 	/*
893 	 * Reset exceptions to big endian.
894 	 *
895 	 * FIXME this is a hack for kexec, we need to reset the exception
896 	 * endian before starting the new kernel and this is a convenient place
897 	 * to do it.
898 	 *
899 	 * This is also called on boot when a fadump happens. In that case we
900 	 * must not change the exception endian mode.
901 	 */
902 	if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
903 		pseries_big_endian_exceptions();
904 #endif
905 }
906 
907 /*
908  * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
909  * the low 3 bits of flags happen to line up.  So no transform is needed.
910  * We can probably optimize here and assume the high bits of newpp are
911  * already zero.  For now I am paranoid.
912  */
913 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
914 				       unsigned long newpp,
915 				       unsigned long vpn,
916 				       int psize, int apsize,
917 				       int ssize, unsigned long inv_flags)
918 {
919 	unsigned long lpar_rc;
920 	unsigned long flags;
921 	unsigned long want_v;
922 
923 	want_v = hpte_encode_avpn(vpn, psize, ssize);
924 
925 	flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN;
926 	flags |= (newpp & HPTE_R_KEY_HI) >> 48;
927 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
928 		/* Move pp0 into bit 8 (IBM 55) */
929 		flags |= (newpp & HPTE_R_PP0) >> 55;
930 
931 	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
932 		 want_v, slot, flags, psize);
933 
934 	lpar_rc = plpar_pte_protect(flags, slot, want_v);
935 
936 	if (lpar_rc == H_NOT_FOUND) {
937 		pr_devel("not found !\n");
938 		return -1;
939 	}
940 
941 	pr_devel("ok\n");
942 
943 	BUG_ON(lpar_rc != H_SUCCESS);
944 
945 	return 0;
946 }
947 
948 static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
949 {
950 	long lpar_rc;
951 	unsigned long i, j;
952 	struct {
953 		unsigned long pteh;
954 		unsigned long ptel;
955 	} ptes[4];
956 
957 	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
958 
959 		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
960 		if (lpar_rc != H_SUCCESS) {
961 			pr_info("Failed to read hash page table at %ld err %ld\n",
962 				hpte_group, lpar_rc);
963 			continue;
964 		}
965 
966 		for (j = 0; j < 4; j++) {
967 			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
968 			    (ptes[j].pteh & HPTE_V_VALID))
969 				return i + j;
970 		}
971 	}
972 
973 	return -1;
974 }
975 
976 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
977 {
978 	long slot;
979 	unsigned long hash;
980 	unsigned long want_v;
981 	unsigned long hpte_group;
982 
983 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
984 	want_v = hpte_encode_avpn(vpn, psize, ssize);
985 
986 	/*
987 	 * We try to keep bolted entries always in primary hash
988 	 * But in some case we can find them in secondary too.
989 	 */
990 	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
991 	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
992 	if (slot < 0) {
993 		/* Try in secondary */
994 		hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
995 		slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
996 		if (slot < 0)
997 			return -1;
998 	}
999 	return hpte_group + slot;
1000 }
1001 
1002 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
1003 					     unsigned long ea,
1004 					     int psize, int ssize)
1005 {
1006 	unsigned long vpn;
1007 	unsigned long lpar_rc, slot, vsid, flags;
1008 
1009 	vsid = get_kernel_vsid(ea, ssize);
1010 	vpn = hpt_vpn(ea, vsid, ssize);
1011 
1012 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1013 	BUG_ON(slot == -1);
1014 
1015 	flags = newpp & (HPTE_R_PP | HPTE_R_N);
1016 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
1017 		/* Move pp0 into bit 8 (IBM 55) */
1018 		flags |= (newpp & HPTE_R_PP0) >> 55;
1019 
1020 	flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO);
1021 
1022 	lpar_rc = plpar_pte_protect(flags, slot, 0);
1023 
1024 	BUG_ON(lpar_rc != H_SUCCESS);
1025 }
1026 
1027 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
1028 					 int psize, int apsize,
1029 					 int ssize, int local)
1030 {
1031 	unsigned long want_v;
1032 	unsigned long lpar_rc;
1033 	unsigned long dummy1, dummy2;
1034 
1035 	pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
1036 		 slot, vpn, psize, local);
1037 
1038 	want_v = hpte_encode_avpn(vpn, psize, ssize);
1039 	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
1040 	if (lpar_rc == H_NOT_FOUND)
1041 		return;
1042 
1043 	BUG_ON(lpar_rc != H_SUCCESS);
1044 }
1045 
1046 
1047 /*
1048  * As defined in the PAPR's section 14.5.4.1.8
1049  * The control mask doesn't include the returned reference and change bit from
1050  * the processed PTE.
1051  */
1052 #define HBLKR_AVPN		0x0100000000000000UL
1053 #define HBLKR_CTRL_MASK		0xf800000000000000UL
1054 #define HBLKR_CTRL_SUCCESS	0x8000000000000000UL
1055 #define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
1056 #define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
1057 
1058 /*
1059  * Returned true if we are supporting this block size for the specified segment
1060  * base page size and actual page size.
1061  *
1062  * Currently, we only support 8 size block.
1063  */
1064 static inline bool is_supported_hlbkrm(int bpsize, int psize)
1065 {
1066 	return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
1067 }
1068 
1069 /**
1070  * H_BLOCK_REMOVE caller.
1071  * @idx should point to the latest @param entry set with a PTEX.
1072  * If PTE cannot be processed because another CPUs has already locked that
1073  * group, those entries are put back in @param starting at index 1.
1074  * If entries has to be retried and @retry_busy is set to true, these entries
1075  * are retried until success. If @retry_busy is set to false, the returned
1076  * is the number of entries yet to process.
1077  */
1078 static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
1079 				       bool retry_busy)
1080 {
1081 	unsigned long i, rc, new_idx;
1082 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1083 
1084 	if (idx < 2) {
1085 		pr_warn("Unexpected empty call to H_BLOCK_REMOVE");
1086 		return 0;
1087 	}
1088 again:
1089 	new_idx = 0;
1090 	if (idx > PLPAR_HCALL9_BUFSIZE) {
1091 		pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx);
1092 		idx = PLPAR_HCALL9_BUFSIZE;
1093 	} else if (idx < PLPAR_HCALL9_BUFSIZE)
1094 		param[idx] = HBR_END;
1095 
1096 	rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
1097 			  param[0], /* AVA */
1098 			  param[1],  param[2],  param[3],  param[4], /* TS0-7 */
1099 			  param[5],  param[6],  param[7],  param[8]);
1100 	if (rc == H_SUCCESS)
1101 		return 0;
1102 
1103 	BUG_ON(rc != H_PARTIAL);
1104 
1105 	/* Check that the unprocessed entries were 'not found' or 'busy' */
1106 	for (i = 0; i < idx-1; i++) {
1107 		unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
1108 
1109 		if (ctrl == HBLKR_CTRL_ERRBUSY) {
1110 			param[++new_idx] = param[i+1];
1111 			continue;
1112 		}
1113 
1114 		BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
1115 		       && ctrl != HBLKR_CTRL_ERRNOTFOUND);
1116 	}
1117 
1118 	/*
1119 	 * If there were entries found busy, retry these entries if requested,
1120 	 * of if all the entries have to be retried.
1121 	 */
1122 	if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
1123 		idx = new_idx + 1;
1124 		goto again;
1125 	}
1126 
1127 	return new_idx;
1128 }
1129 
1130 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
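/*
 * Huge page HPTEs are invalidated in batches of 12 so that each acquisition
 * of pSeries_lpar_tlbie_lock covers a bounded amount of work: with
 * H_BULK_REMOVE taking four entries per hcall, that is three hcalls. Batching
 * under the lock also avoids bouncing the hypervisor tlbie lock.
 */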
1135 #define PPC64_HUGE_HPTE_BATCH 12
1136 
1137 static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
1138 				      int count, int psize, int ssize)
1139 {
1140 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1141 	unsigned long shift, current_vpgb, vpgb;
1142 	int i, pix = 0;
1143 
1144 	shift = mmu_psize_defs[psize].shift;
1145 
1146 	for (i = 0; i < count; i++) {
1147 		/*
1148 		 * Shifting 3 bits more on the right to get a
1149 		 * 8 pages aligned virtual addresse.
1150 		 */
1151 		vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
1152 		if (!pix || vpgb != current_vpgb) {
1153 			/*
1154 			 * Need to start a new 8 pages block, flush
1155 			 * the current one if needed.
1156 			 */
1157 			if (pix)
1158 				(void)call_block_remove(pix, param, true);
1159 			current_vpgb = vpgb;
1160 			param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
1161 			pix = 1;
1162 		}
1163 
1164 		param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
1165 		if (pix == PLPAR_HCALL9_BUFSIZE) {
1166 			pix = call_block_remove(pix, param, false);
1167 			/*
1168 			 * pix = 0 means that all the entries were
1169 			 * removed, we can start a new block.
1170 			 * Otherwise, this means that there are entries
1171 			 * to retry, and pix points to latest one, so
1172 			 * we should increment it and try to continue
1173 			 * the same block.
1174 			 */
1175 			if (pix)
1176 				pix++;
1177 		}
1178 	}
1179 	if (pix)
1180 		(void)call_block_remove(pix, param, true);
1181 }
1182 
1183 static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
1184 				     int count, int psize, int ssize)
1185 {
1186 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1187 	int i = 0, pix = 0, rc;
1188 
1189 	for (i = 0; i < count; i++) {
1190 
1191 		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1192 			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
1193 						     ssize, 0);
1194 		} else {
1195 			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
1196 			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
1197 			pix += 2;
1198 			if (pix == 8) {
1199 				rc = plpar_hcall9(H_BULK_REMOVE, param,
1200 						  param[0], param[1], param[2],
1201 						  param[3], param[4], param[5],
1202 						  param[6], param[7]);
1203 				BUG_ON(rc != H_SUCCESS);
1204 				pix = 0;
1205 			}
1206 		}
1207 	}
1208 	if (pix) {
1209 		param[pix] = HBR_END;
1210 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1211 				  param[2], param[3], param[4], param[5],
1212 				  param[6], param[7]);
1213 		BUG_ON(rc != H_SUCCESS);
1214 	}
1215 }
1216 
1217 static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
1218 						      unsigned long *vpn,
1219 						      int count, int psize,
1220 						      int ssize)
1221 {
1222 	unsigned long flags = 0;
1223 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1224 
1225 	if (lock_tlbie)
1226 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1227 
1228 	/* Assuming THP size is 16M */
1229 	if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
1230 		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
1231 	else
1232 		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
1233 
1234 	if (lock_tlbie)
1235 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1236 }
1237 
1238 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1239 					     unsigned long addr,
1240 					     unsigned char *hpte_slot_array,
1241 					     int psize, int ssize, int local)
1242 {
1243 	int i, index = 0;
1244 	unsigned long s_addr = addr;
1245 	unsigned int max_hpte_count, valid;
1246 	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
1247 	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
1248 	unsigned long shift, hidx, vpn = 0, hash, slot;
1249 
1250 	shift = mmu_psize_defs[psize].shift;
1251 	max_hpte_count = 1U << (PMD_SHIFT - shift);
1252 
1253 	for (i = 0; i < max_hpte_count; i++) {
1254 		valid = hpte_valid(hpte_slot_array, i);
1255 		if (!valid)
1256 			continue;
1257 		hidx =  hpte_hash_index(hpte_slot_array, i);
1258 
1259 		/* get the vpn */
1260 		addr = s_addr + (i * (1ul << shift));
1261 		vpn = hpt_vpn(addr, vsid, ssize);
1262 		hash = hpt_hash(vpn, shift, ssize);
1263 		if (hidx & _PTEIDX_SECONDARY)
1264 			hash = ~hash;
1265 
1266 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1267 		slot += hidx & _PTEIDX_GROUP_IX;
1268 
1269 		slot_array[index] = slot;
1270 		vpn_array[index] = vpn;
1271 		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
1272 			/*
1273 			 * Now do a bluk invalidate
1274 			 */
1275 			__pSeries_lpar_hugepage_invalidate(slot_array,
1276 							   vpn_array,
1277 							   PPC64_HUGE_HPTE_BATCH,
1278 							   psize, ssize);
1279 			index = 0;
1280 		} else
1281 			index++;
1282 	}
1283 	if (index)
1284 		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
1285 						   index, psize, ssize);
1286 }
1287 #else
/*
 * Variant built when CONFIG_TRANSPARENT_HUGEPAGE is off: it does no
 * invalidation and only warns that it was called.
 */
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
					     unsigned long addr,
					     unsigned char *hpte_slot_array,
					     int psize, int ssize, int local)
{
	WARN(1, "%s called without THP support\n", __func__);
}
1295 #endif
1296 
1297 static int pSeries_lpar_hpte_removebolted(unsigned long ea,
1298 					  int psize, int ssize)
1299 {
1300 	unsigned long vpn;
1301 	unsigned long slot, vsid;
1302 
1303 	vsid = get_kernel_vsid(ea, ssize);
1304 	vpn = hpt_vpn(ea, vsid, ssize);
1305 
1306 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1307 	if (slot == -1)
1308 		return -ENOENT;
1309 
1310 	/*
1311 	 * lpar doesn't use the passed actual page size
1312 	 */
1313 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
1314 	return 0;
1315 }
1316 
1317 
1318 static inline unsigned long compute_slot(real_pte_t pte,
1319 					 unsigned long vpn,
1320 					 unsigned long index,
1321 					 unsigned long shift,
1322 					 int ssize)
1323 {
1324 	unsigned long slot, hash, hidx;
1325 
1326 	hash = hpt_hash(vpn, shift, ssize);
1327 	hidx = __rpte_to_hidx(pte, index);
1328 	if (hidx & _PTEIDX_SECONDARY)
1329 		hash = ~hash;
1330 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1331 	slot += hidx & _PTEIDX_GROUP_IX;
1332 	return slot;
1333 }
1334 
1335 /**
1336  * The hcall H_BLOCK_REMOVE implies that the virtual pages to processed are
1337  * "all within the same naturally aligned 8 page virtual address block".
1338  */
1339 static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
1340 			    unsigned long *param)
1341 {
1342 	unsigned long vpn;
1343 	unsigned long i, pix = 0;
1344 	unsigned long index, shift, slot, current_vpgb, vpgb;
1345 	real_pte_t pte;
1346 	int psize, ssize;
1347 
1348 	psize = batch->psize;
1349 	ssize = batch->ssize;
1350 
1351 	for (i = 0; i < number; i++) {
1352 		vpn = batch->vpn[i];
1353 		pte = batch->pte[i];
1354 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1355 			/*
1356 			 * Shifting 3 bits more on the right to get a
1357 			 * 8 pages aligned virtual addresse.
1358 			 */
1359 			vpgb = (vpn >> (shift - VPN_SHIFT + 3));
1360 			if (!pix || vpgb != current_vpgb) {
1361 				/*
1362 				 * Need to start a new 8 pages block, flush
1363 				 * the current one if needed.
1364 				 */
1365 				if (pix)
1366 					(void)call_block_remove(pix, param,
1367 								true);
1368 				current_vpgb = vpgb;
1369 				param[0] = hpte_encode_avpn(vpn, psize,
1370 							    ssize);
1371 				pix = 1;
1372 			}
1373 
1374 			slot = compute_slot(pte, vpn, index, shift, ssize);
1375 			param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
1376 
1377 			if (pix == PLPAR_HCALL9_BUFSIZE) {
1378 				pix = call_block_remove(pix, param, false);
1379 				/*
1380 				 * pix = 0 means that all the entries were
1381 				 * removed, we can start a new block.
1382 				 * Otherwise, this means that there are entries
1383 				 * to retry, and pix points to latest one, so
1384 				 * we should increment it and try to continue
1385 				 * the same block.
1386 				 */
1387 				if (pix)
1388 					pix++;
1389 			}
1390 		} pte_iterate_hashed_end();
1391 	}
1392 
1393 	if (pix)
1394 		(void)call_block_remove(pix, param, true);
1395 }
1396 
1397 /*
1398  * TLB Block Invalidate Characteristics
1399  *
 * These characteristics define the size of the block that the H_BLOCK_REMOVE
 * hcall is able to process for each (segment base page size, actual page
 * size) pair.
 *
 * The ibm,get-system-parameter RTAS call returns a buffer with the following
 * layout:
1405  *
1406  * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
1407  * -----------------
1408  * TLB Block Invalidate Specifiers:
1409  * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
1410  * [ 1 byte Number of page sizes (N) that are supported for the specified
1411  *          TLB invalidate block size ]
1412  * [ 1 byte Encoded segment base page size and actual page size
1413  *          MSB=0 means 4k segment base page size and actual page size
1414  *          MSB=1 the penc value in mmu_psize_def ]
1415  * ...
1416  * -----------------
1417  * Next TLB Block Invalidate Specifiers...
1418  * -----------------
1419  * [ 0 ]
1420  */
1421 static inline void set_hblkrm_bloc_size(int bpsize, int psize,
1422 					unsigned int block_size)
1423 {
1424 	if (block_size > hblkrm_size[bpsize][psize])
1425 		hblkrm_size[bpsize][psize] = block_size;
1426 }
1427 
1428 /*
1429  * Decode the Encoded segment base page size and actual page size.
1430  * PAPR specifies:
1431  *   - bit 7 is the L bit
1432  *   - bits 0-5 are the penc value
1433  * If the L bit is 0, this means 4K segment base page size and actual page size
1434  * otherwise the penc value should be read.
1435  */
1436 #define HBLKRM_L_MASK		0x80
1437 #define HBLKRM_PENC_MASK	0x3f
1438 static inline void __init check_lp_set_hblkrm(unsigned int lp,
1439 					      unsigned int block_size)
1440 {
1441 	unsigned int bpsize, psize;
1442 
1443 	/* First, check the L bit, if not set, this means 4K */
1444 	if ((lp & HBLKRM_L_MASK) == 0) {
1445 		set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
1446 		return;
1447 	}
1448 
1449 	lp &= HBLKRM_PENC_MASK;
1450 	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
1451 		struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
1452 
1453 		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
1454 			if (def->penc[psize] == lp) {
1455 				set_hblkrm_bloc_size(bpsize, psize, block_size);
1456 				return;
1457 			}
1458 		}
1459 	}
1460 }
1461 
1462 #define SPLPAR_TLB_BIC_TOKEN		50
1463 
1464 /*
1465  * The size of the TLB Block Invalidate Characteristics is variable. But at the
1466  * maximum it will be the number of possible page sizes *2 + 10 bytes.
1467  * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
1468  * (128 bytes) for the buffer to get plenty of space.
1469  */
1470 #define SPLPAR_TLB_BIC_MAXLENGTH	128
1471 
1472 void __init pseries_lpar_read_hblkrm_characteristics(void)
1473 {
1474 	unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
1475 	int call_status, len, idx, bpsize;
1476 
1477 	if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
1478 		return;
1479 
1480 	spin_lock(&rtas_data_buf_lock);
1481 	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
1482 	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
1483 				NULL,
1484 				SPLPAR_TLB_BIC_TOKEN,
1485 				__pa(rtas_data_buf),
1486 				RTAS_DATA_BUF_SIZE);
1487 	memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
1488 	local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
1489 	spin_unlock(&rtas_data_buf_lock);
1490 
1491 	if (call_status != 0) {
1492 		pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
1493 			__FILE__, __func__, call_status);
1494 		return;
1495 	}
1496 
1497 	/*
1498 	 * The first two (2) bytes of the data in the buffer are the length of
1499 	 * the returned data, not counting these first two (2) bytes.
1500 	 */
1501 	len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
1502 	if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
1503 		pr_warn("%s too large returned buffer %d", __func__, len);
1504 		return;
1505 	}
1506 
1507 	idx = 2;
1508 	while (idx < len) {
1509 		u8 block_shift = local_buffer[idx++];
1510 		u32 block_size;
1511 		unsigned int npsize;
1512 
1513 		if (!block_shift)
1514 			break;
1515 
1516 		block_size = 1 << block_shift;
1517 
1518 		for (npsize = local_buffer[idx++];
1519 		     npsize > 0 && idx < len; npsize--)
1520 			check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
1521 					    block_size);
1522 	}
1523 
1524 	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
1525 		for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
1526 			if (hblkrm_size[bpsize][idx])
1527 				pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
1528 					bpsize, idx, hblkrm_size[bpsize][idx]);
1529 }
1530 
1531 /*
1532  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
1533  * lock.
1534  */
1535 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
1536 {
1537 	unsigned long vpn;
1538 	unsigned long i, pix, rc;
1539 	unsigned long flags = 0;
1540 	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
1541 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1542 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1543 	unsigned long index, shift, slot;
1544 	real_pte_t pte;
1545 	int psize, ssize;
1546 
1547 	if (lock_tlbie)
1548 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1549 
1550 	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
1551 		do_block_remove(number, batch, param);
1552 		goto out;
1553 	}
1554 
1555 	psize = batch->psize;
1556 	ssize = batch->ssize;
1557 	pix = 0;
1558 	for (i = 0; i < number; i++) {
1559 		vpn = batch->vpn[i];
1560 		pte = batch->pte[i];
1561 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1562 			slot = compute_slot(pte, vpn, index, shift, ssize);
1563 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1564 				/*
1565 				 * lpar doesn't use the passed actual page size
1566 				 */
1567 				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
1568 							     0, ssize, local);
1569 			} else {
1570 				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
1571 				param[pix+1] = hpte_encode_avpn(vpn, psize,
1572 								ssize);
1573 				pix += 2;
1574 				if (pix == 8) {
1575 					rc = plpar_hcall9(H_BULK_REMOVE, param,
1576 						param[0], param[1], param[2],
1577 						param[3], param[4], param[5],
1578 						param[6], param[7]);
1579 					BUG_ON(rc != H_SUCCESS);
1580 					pix = 0;
1581 				}
1582 			}
1583 		} pte_iterate_hashed_end();
1584 	}
1585 	if (pix) {
1586 		param[pix] = HBR_END;
1587 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1588 				  param[2], param[3], param[4], param[5],
1589 				  param[6], param[7]);
1590 		BUG_ON(rc != H_SUCCESS);
1591 	}
1592 
1593 out:
1594 	if (lock_tlbie)
1595 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1596 }
1597 
1598 static int __init disable_bulk_remove(char *str)
1599 {
1600 	if (strcmp(str, "off") == 0 &&
1601 	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1602 		pr_info("Disabling BULK_REMOVE firmware feature");
1603 		powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
1604 	}
1605 	return 1;
1606 }
1607 
1608 __setup("bulk_remove=", disable_bulk_remove);
1609 
/* Upper bound on the accumulated long-busy delays of an HPT resize prepare */
#define HPT_RESIZE_TIMEOUT	10000 /* ms */

/* Argument block handed to pseries_lpar_resize_hpt_commit() */
struct hpt_resize_state {
	unsigned long shift;	/* requested HPT size, as a shift */
	int commit_rc;		/* status returned by the commit hcall */
};
1616 
1617 static int pseries_lpar_resize_hpt_commit(void *data)
1618 {
1619 	struct hpt_resize_state *state = data;
1620 
1621 	state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
1622 	if (state->commit_rc != H_SUCCESS)
1623 		return -EIO;
1624 
1625 	/* Hypervisor has transitioned the HTAB, update our globals */
1626 	ppc64_pft_size = state->shift;
1627 	htab_size_bytes = 1UL << ppc64_pft_size;
1628 	htab_hash_mask = (htab_size_bytes >> 7) - 1;
1629 
1630 	return 0;
1631 }
1632 
1633 /*
1634  * Must be called in process context. The caller must hold the
1635  * cpus_lock.
1636  */
1637 static int pseries_lpar_resize_hpt(unsigned long shift)
1638 {
1639 	struct hpt_resize_state state = {
1640 		.shift = shift,
1641 		.commit_rc = H_FUNCTION,
1642 	};
1643 	unsigned int delay, total_delay = 0;
1644 	int rc;
1645 	ktime_t t0, t1, t2;
1646 
1647 	might_sleep();
1648 
1649 	if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1650 		return -ENODEV;
1651 
1652 	pr_info("Attempting to resize HPT to shift %lu\n", shift);
1653 
1654 	t0 = ktime_get();
1655 
1656 	rc = plpar_resize_hpt_prepare(0, shift);
1657 	while (H_IS_LONG_BUSY(rc)) {
1658 		delay = get_longbusy_msecs(rc);
1659 		total_delay += delay;
1660 		if (total_delay > HPT_RESIZE_TIMEOUT) {
1661 			/* prepare with shift==0 cancels an in-progress resize */
1662 			rc = plpar_resize_hpt_prepare(0, 0);
1663 			if (rc != H_SUCCESS)
1664 				pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
1665 				       rc);
1666 			return -ETIMEDOUT;
1667 		}
1668 		msleep(delay);
1669 		rc = plpar_resize_hpt_prepare(0, shift);
1670 	}
1671 
1672 	switch (rc) {
1673 	case H_SUCCESS:
1674 		/* Continue on */
1675 		break;
1676 
1677 	case H_PARAMETER:
1678 		pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
1679 		return -EINVAL;
1680 	case H_RESOURCE:
1681 		pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
1682 		return -EPERM;
1683 	default:
1684 		pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
1685 		return -EIO;
1686 	}
1687 
1688 	t1 = ktime_get();
1689 
1690 	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
1691 				     &state, NULL);
1692 
1693 	t2 = ktime_get();
1694 
1695 	if (rc != 0) {
1696 		switch (state.commit_rc) {
1697 		case H_PTEG_FULL:
1698 			return -ENOSPC;
1699 
1700 		default:
1701 			pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
1702 				state.commit_rc);
1703 			return -EIO;
1704 		};
1705 	}
1706 
1707 	pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
1708 		shift, (long long) ktime_ms_delta(t1, t0),
1709 		(long long) ktime_ms_delta(t2, t1));
1710 
1711 	return 0;
1712 }
1713 
1714 void __init hpte_init_pseries(void)
1715 {
1716 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
1717 	mmu_hash_ops.hpte_updatepp	 = pSeries_lpar_hpte_updatepp;
1718 	mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
1719 	mmu_hash_ops.hpte_insert	 = pSeries_lpar_hpte_insert;
1720 	mmu_hash_ops.hpte_remove	 = pSeries_lpar_hpte_remove;
1721 	mmu_hash_ops.hpte_removebolted   = pSeries_lpar_hpte_removebolted;
1722 	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
1723 	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
1724 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
1725 
1726 	if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1727 		mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
1728 
1729 	/*
1730 	 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
1731 	 * to inform the hypervisor that we wish to use the HPT.
1732 	 */
1733 	if (cpu_has_feature(CPU_FTR_ARCH_300))
1734 		pseries_lpar_register_process_table(0, 0, 0);
1735 }
1736 #endif /* CONFIG_PPC_64S_HASH_MMU */
1737 
1738 #ifdef CONFIG_PPC_RADIX_MMU
1739 void __init radix_init_pseries(void)
1740 {
1741 	pr_info("Using radix MMU under hypervisor\n");
1742 
1743 	pseries_lpar_register_process_table(__pa(process_tb),
1744 						0, PRTB_SIZE_SHIFT - 12);
1745 }
1746 #endif
1747 
1748 #ifdef CONFIG_PPC_SMLPAR
1749 #define CMO_FREE_HINT_DEFAULT 1
1750 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
1751 
1752 static int __init cmo_free_hint(char *str)
1753 {
1754 	char *parm;
1755 	parm = strstrip(str);
1756 
1757 	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
1758 		pr_info("%s: CMO free page hinting is not active.\n", __func__);
1759 		cmo_free_hint_flag = 0;
1760 		return 1;
1761 	}
1762 
1763 	cmo_free_hint_flag = 1;
1764 	pr_info("%s: CMO free page hinting is active.\n", __func__);
1765 
1766 	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
1767 		return 1;
1768 
1769 	return 0;
1770 }
1771 
1772 __setup("cmo_free_hint=", cmo_free_hint);
1773 
1774 static void pSeries_set_page_state(struct page *page, int order,
1775 				   unsigned long state)
1776 {
1777 	int i, j;
1778 	unsigned long cmo_page_sz, addr;
1779 
1780 	cmo_page_sz = cmo_get_page_size();
1781 	addr = __pa((unsigned long)page_address(page));
1782 
1783 	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
1784 		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
1785 			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
1786 	}
1787 }
1788 
1789 void arch_free_page(struct page *page, int order)
1790 {
1791 	if (radix_enabled())
1792 		return;
1793 	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
1794 		return;
1795 
1796 	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
1797 }
1798 EXPORT_SYMBOL(arch_free_page);
1799 
1800 #endif /* CONFIG_PPC_SMLPAR */
1801 #endif /* CONFIG_PPC_BOOK3S_64 */
1802 
1803 #ifdef CONFIG_TRACEPOINTS
1804 #ifdef CONFIG_JUMP_LABEL
1805 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
1806 
1807 int hcall_tracepoint_regfunc(void)
1808 {
1809 	static_key_slow_inc(&hcall_tracepoint_key);
1810 	return 0;
1811 }
1812 
1813 void hcall_tracepoint_unregfunc(void)
1814 {
1815 	static_key_slow_dec(&hcall_tracepoint_key);
1816 }
1817 #else
/*
 * Without jump labels, the hcall path is optimised by placing
 * hcall_tracepoint_refcount directly in the TOC, so that a single load tells
 * whether the hcall tracepoints are enabled.
 */

/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;

/* Tracepoint registration hook: one more user of the hcall tracepoints */
int hcall_tracepoint_regfunc(void)
{
	hcall_tracepoint_refcount++;

	return 0;
}

/* Tracepoint unregistration hook: one user less */
void hcall_tracepoint_unregfunc(void)
{
	hcall_tracepoint_refcount--;
}
1837 #endif
1838 
1839 /*
1840  * Keep track of hcall tracing depth and prevent recursion. Warn if any is
1841  * detected because it may indicate a problem. This will not catch all
1842  * problems with tracing code making hcalls, because the tracing might have
1843  * been invoked from a non-hcall, so the first hcall could recurse into it
 * without warning here, but this is better than nothing.
1845  *
1846  * Hcalls with specific problems being traced should use the _notrace
1847  * plpar_hcall variants.
1848  */
1849 static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
1850 
1851 
1852 notrace void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
1853 {
1854 	unsigned long flags;
1855 	unsigned int *depth;
1856 
1857 	local_irq_save(flags);
1858 
1859 	depth = this_cpu_ptr(&hcall_trace_depth);
1860 
1861 	if (WARN_ON_ONCE(*depth))
1862 		goto out;
1863 
1864 	(*depth)++;
1865 	preempt_disable();
1866 	trace_hcall_entry(opcode, args);
1867 	(*depth)--;
1868 
1869 out:
1870 	local_irq_restore(flags);
1871 }
1872 
1873 notrace void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
1874 {
1875 	unsigned long flags;
1876 	unsigned int *depth;
1877 
1878 	local_irq_save(flags);
1879 
1880 	depth = this_cpu_ptr(&hcall_trace_depth);
1881 
1882 	if (*depth) /* Don't warn again on the way out */
1883 		goto out;
1884 
1885 	(*depth)++;
1886 	trace_hcall_exit(opcode, retval, retbuf);
1887 	preempt_enable();
1888 	(*depth)--;
1889 
1890 out:
1891 	local_irq_restore(flags);
1892 }
1893 #endif
1894 
1895 /**
1896  * h_get_mpp
1897  * H_GET_MPP hcall returns info in 7 parms
1898  */
1899 int h_get_mpp(struct hvcall_mpp_data *mpp_data)
1900 {
1901 	int rc;
1902 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1903 
1904 	rc = plpar_hcall9(H_GET_MPP, retbuf);
1905 
1906 	mpp_data->entitled_mem = retbuf[0];
1907 	mpp_data->mapped_mem = retbuf[1];
1908 
1909 	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
1910 	mpp_data->pool_num = retbuf[2] & 0xffff;
1911 
1912 	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
1913 	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
1914 	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
1915 
1916 	mpp_data->pool_size = retbuf[4];
1917 	mpp_data->loan_request = retbuf[5];
1918 	mpp_data->backing_mem = retbuf[6];
1919 
1920 	return rc;
1921 }
1922 EXPORT_SYMBOL(h_get_mpp);
1923 
1924 int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
1925 {
1926 	int rc;
1927 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
1928 
1929 	rc = plpar_hcall9(H_GET_MPP_X, retbuf);
1930 
1931 	mpp_x_data->coalesced_bytes = retbuf[0];
1932 	mpp_x_data->pool_coalesced_bytes = retbuf[1];
1933 	mpp_x_data->pool_purr_cycles = retbuf[2];
1934 	mpp_x_data->pool_spurr_cycles = retbuf[3];
1935 
1936 	return rc;
1937 }
1938 
1939 #ifdef CONFIG_PPC_64S_HASH_MMU
1940 static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize)
1941 {
1942 	unsigned long protovsid;
1943 	unsigned long va_bits = VA_BITS;
1944 	unsigned long modinv, vsid_modulus;
1945 	unsigned long max_mod_inv, tmp_modinv;
1946 
1947 	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
1948 		va_bits = 65;
1949 
1950 	if (ssize == MMU_SEGSIZE_256M) {
1951 		modinv = VSID_MULINV_256M;
1952 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
1953 	} else {
1954 		modinv = VSID_MULINV_1T;
1955 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
1956 	}
1957 
1958 	/*
1959 	 * vsid outside our range.
1960 	 */
1961 	if (vsid >= vsid_modulus)
1962 		return 0;
1963 
1964 	/*
1965 	 * If modinv is the modular multiplicate inverse of (x % vsid_modulus)
1966 	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
1967 	 *   protovsid = (vsid * modinv) % vsid_modulus
1968 	 */
1969 
1970 	/* Check if (vsid * modinv) overflow (63 bits) */
1971 	max_mod_inv = 0x7fffffffffffffffull / vsid;
1972 	if (modinv < max_mod_inv)
1973 		return (vsid * modinv) % vsid_modulus;
1974 
1975 	tmp_modinv = modinv/max_mod_inv;
1976 	modinv %= max_mod_inv;
1977 
1978 	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
1979 	protovsid = (protovsid + vsid * modinv) % vsid_modulus;
1980 
1981 	return protovsid;
1982 }
1983 
1984 static int __init reserve_vrma_context_id(void)
1985 {
1986 	unsigned long protovsid;
1987 
1988 	/*
1989 	 * Reserve context ids which map to reserved virtual addresses. For now
1990 	 * we only reserve the context id which maps to the VRMA VSID. We ignore
1991 	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
1992 	 * enable adjunct support via the "ibm,client-architecture-support"
1993 	 * interface.
1994 	 */
1995 	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
1996 	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
1997 	return 0;
1998 }
1999 machine_device_initcall(pseries, reserve_vrma_context_id);
2000 #endif
2001 
2002 #ifdef CONFIG_DEBUG_FS
2003 /* debugfs file interface for vpa data */
2004 static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
2005 			      loff_t *pos)
2006 {
2007 	int cpu = (long)filp->private_data;
2008 	struct lppaca *lppaca = &lppaca_of(cpu);
2009 
2010 	return simple_read_from_buffer(buf, len, pos, lppaca,
2011 				sizeof(struct lppaca));
2012 }
2013 
2014 static const struct file_operations vpa_fops = {
2015 	.open		= simple_open,
2016 	.read		= vpa_file_read,
2017 	.llseek		= default_llseek,
2018 };
2019 
2020 static int __init vpa_debugfs_init(void)
2021 {
2022 	char name[16];
2023 	long i;
2024 	struct dentry *vpa_dir;
2025 
2026 	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
2027 		return 0;
2028 
2029 	vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir);
2030 
2031 	/* set up the per-cpu vpa file*/
2032 	for_each_possible_cpu(i) {
2033 		sprintf(name, "cpu-%ld", i);
2034 		debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops);
2035 	}
2036 
2037 	return 0;
2038 }
2039 machine_arch_initcall(pseries, vpa_debugfs_init);
2040 #endif /* CONFIG_DEBUG_FS */
2041