xref: /qemu/hw/ppc/spapr.c (revision d072cdf3)
1 /*
2  * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
3  *
4  * Copyright (c) 2004-2007 Fabrice Bellard
5  * Copyright (c) 2007 Jocelyn Mayer
6  * Copyright (c) 2010 David Gibson, IBM Corporation.
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  *
26  */
27 #include "sysemu/sysemu.h"
28 #include "hw/hw.h"
29 #include "hw/fw-path-provider.h"
30 #include "elf.h"
31 #include "net/net.h"
32 #include "sysemu/blockdev.h"
33 #include "sysemu/cpus.h"
34 #include "sysemu/kvm.h"
35 #include "kvm_ppc.h"
36 #include "mmu-hash64.h"
37 #include "qom/cpu.h"
38 
39 #include "hw/boards.h"
40 #include "hw/ppc/ppc.h"
41 #include "hw/loader.h"
42 
43 #include "hw/ppc/spapr.h"
44 #include "hw/ppc/spapr_vio.h"
45 #include "hw/pci-host/spapr.h"
46 #include "hw/ppc/xics.h"
47 #include "hw/pci/msi.h"
48 
49 #include "hw/pci/pci.h"
50 #include "hw/scsi/scsi.h"
51 #include "hw/virtio/virtio-scsi.h"
52 
53 #include "exec/address-spaces.h"
54 #include "hw/usb.h"
55 #include "qemu/config-file.h"
56 #include "qemu/error-report.h"
57 #include "trace.h"
58 
59 #include <libfdt.h>
60 
61 /* SLOF memory layout:
62  *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that
65  *
66  * So we set FW_OVERHEAD to 40MB which should account for all of that
67  * and more
68  *
69  * We load our kernel at 4M, leaving space for SLOF initial image
70  */
/* Size of the buffers used to build the flattened device tree (256KiB) */
#define FDT_MAX_SIZE            0x40000
#define RTAS_MAX_SIZE           0x10000
/* 4MiB; also used as the kernel load address below */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
/* 40MiB, see the SLOF memory layout comment above */
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

/* NOTE(review): unit is not visible in this chunk, presumably MiB -- confirm */
#define MIN_RMA_SLOF            128UL

/* Timebase frequency advertised to the guest when KVM is not in use */
#define TIMEBASE_FREQ           512000000ULL

#define MAX_CPUS                256

/* phandle assigned to the "interrupt-controller" device tree node */
#define PHANDLE_XICP            0x00001111

/* Hash page table size in bytes: 2^htab_shift */
#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
typedef struct sPAPRMachineState sPAPRMachineState;

/* QOM type name and checked-cast helper for the sPAPR machine object */
#define TYPE_SPAPR_MACHINE      "spapr-machine"
#define SPAPR_MACHINE(obj) \
    OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE)

/**
 * sPAPRMachineState:
 * @parent_obj: the generic MachineState this type extends
 * @kvm_type: string option; NOTE(review): its consumers are outside this
 *            chunk -- confirm semantics there
 */
struct sPAPRMachineState {
    /*< private >*/
    MachineState parent_obj;

    /*< public >*/
    char *kvm_type;
};

/*
 * Global sPAPR environment.  It is read directly by the reset handlers and
 * by spapr_h_cas_compose_response() in this file.
 */
sPAPREnvironment *spapr;
106 
107 static XICSState *try_create_xics(const char *type, int nr_servers,
108                                   int nr_irqs)
109 {
110     DeviceState *dev;
111 
112     dev = qdev_create(NULL, type);
113     qdev_prop_set_uint32(dev, "nr_servers", nr_servers);
114     qdev_prop_set_uint32(dev, "nr_irqs", nr_irqs);
115     if (qdev_init(dev) < 0) {
116         return NULL;
117     }
118 
119     return XICS_COMMON(dev);
120 }
121 
122 static XICSState *xics_system_init(int nr_servers, int nr_irqs)
123 {
124     XICSState *icp = NULL;
125 
126     if (kvm_enabled()) {
127         QemuOpts *machine_opts = qemu_get_machine_opts();
128         bool irqchip_allowed = qemu_opt_get_bool(machine_opts,
129                                                 "kernel_irqchip", true);
130         bool irqchip_required = qemu_opt_get_bool(machine_opts,
131                                                   "kernel_irqchip", false);
132         if (irqchip_allowed) {
133             icp = try_create_xics(TYPE_KVM_XICS, nr_servers, nr_irqs);
134         }
135 
136         if (irqchip_required && !icp) {
137             perror("Failed to create in-kernel XICS\n");
138             abort();
139         }
140     }
141 
142     if (!icp) {
143         icp = try_create_xics(TYPE_XICS, nr_servers, nr_irqs);
144     }
145 
146     if (!icp) {
147         perror("Failed to create XICS\n");
148         abort();
149     }
150 
151     return icp;
152 }
153 
154 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
155                                   int smt_threads)
156 {
157     int i, ret = 0;
158     uint32_t servers_prop[smt_threads];
159     uint32_t gservers_prop[smt_threads * 2];
160     int index = ppc_get_vcpu_dt_id(cpu);
161 
162     if (cpu->cpu_version) {
163         ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->cpu_version);
164         if (ret < 0) {
165             return ret;
166         }
167     }
168 
169     /* Build interrupt servers and gservers properties */
170     for (i = 0; i < smt_threads; i++) {
171         servers_prop[i] = cpu_to_be32(index + i);
172         /* Hack, direct the group queues back to cpu 0 */
173         gservers_prop[i*2] = cpu_to_be32(index + i);
174         gservers_prop[i*2 + 1] = 0;
175     }
176     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
177                       servers_prop, sizeof(servers_prop));
178     if (ret < 0) {
179         return ret;
180     }
181     ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
182                       gservers_prop, sizeof(gservers_prop));
183 
184     return ret;
185 }
186 
187 static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
188 {
189     int ret = 0, offset, cpus_offset;
190     CPUState *cs;
191     char cpu_model[32];
192     int smt = kvmppc_smt_threads();
193     uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
194 
195     CPU_FOREACH(cs) {
196         PowerPCCPU *cpu = POWERPC_CPU(cs);
197         DeviceClass *dc = DEVICE_GET_CLASS(cs);
198         int index = ppc_get_vcpu_dt_id(cpu);
199         uint32_t associativity[] = {cpu_to_be32(0x5),
200                                     cpu_to_be32(0x0),
201                                     cpu_to_be32(0x0),
202                                     cpu_to_be32(0x0),
203                                     cpu_to_be32(cs->numa_node),
204                                     cpu_to_be32(index)};
205 
206         if ((index % smt) != 0) {
207             continue;
208         }
209 
210         snprintf(cpu_model, 32, "%s@%x", dc->fw_name, index);
211 
212         cpus_offset = fdt_path_offset(fdt, "/cpus");
213         if (cpus_offset < 0) {
214             cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
215                                           "cpus");
216             if (cpus_offset < 0) {
217                 return cpus_offset;
218             }
219         }
220         offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
221         if (offset < 0) {
222             offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
223             if (offset < 0) {
224                 return offset;
225             }
226         }
227 
228         if (nb_numa_nodes > 1) {
229             ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
230                               sizeof(associativity));
231             if (ret < 0) {
232                 return ret;
233             }
234         }
235 
236         ret = fdt_setprop(fdt, offset, "ibm,pft-size",
237                           pft_size_prop, sizeof(pft_size_prop));
238         if (ret < 0) {
239             return ret;
240         }
241 
242         ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
243                                      ppc_get_compat_smt_threads(cpu));
244         if (ret < 0) {
245             return ret;
246         }
247     }
248     return ret;
249 }
250 
251 
252 static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
253                                      size_t maxsize)
254 {
255     size_t maxcells = maxsize / sizeof(uint32_t);
256     int i, j, count;
257     uint32_t *p = prop;
258 
259     for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
260         struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
261 
262         if (!sps->page_shift) {
263             break;
264         }
265         for (count = 0; count < PPC_PAGE_SIZES_MAX_SZ; count++) {
266             if (sps->enc[count].page_shift == 0) {
267                 break;
268             }
269         }
270         if ((p - prop) >= (maxcells - 3 - count * 2)) {
271             break;
272         }
273         *(p++) = cpu_to_be32(sps->page_shift);
274         *(p++) = cpu_to_be32(sps->slb_enc);
275         *(p++) = cpu_to_be32(count);
276         for (j = 0; j < count; j++) {
277             *(p++) = cpu_to_be32(sps->enc[j].page_shift);
278             *(p++) = cpu_to_be32(sps->enc[j].pte_enc);
279         }
280     }
281 
282     return (p - prop) * sizeof(uint32_t);
283 }
284 
/*
 * Evaluate a libfdt call (or any int expression) and terminate QEMU with
 * exit(1) if the result is negative, printing the expression text and the
 * libfdt error string to stderr.
 */
#define _FDT(exp) \
    do { \
        int ret = (exp);                                           \
        if (ret < 0) {                                             \
            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
                    #exp, fdt_strerror(ret));                      \
            exit(1);                                               \
        }                                                          \
    } while (0)
294 
295 static void add_str(GString *s, const gchar *s1)
296 {
297     g_string_append_len(s, s1, strlen(s1) + 1);
298 }
299 
300 static void *spapr_create_fdt_skel(hwaddr initrd_base,
301                                    hwaddr initrd_size,
302                                    hwaddr kernel_size,
303                                    bool little_endian,
304                                    const char *boot_device,
305                                    const char *kernel_cmdline,
306                                    uint32_t epow_irq)
307 {
308     void *fdt;
309     CPUState *cs;
310     uint32_t start_prop = cpu_to_be32(initrd_base);
311     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
312     GString *hypertas = g_string_sized_new(256);
313     GString *qemu_hypertas = g_string_sized_new(256);
314     uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
315     uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
316     int smt = kvmppc_smt_threads();
317     unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
318     QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
319     unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
320     uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
321 
322     add_str(hypertas, "hcall-pft");
323     add_str(hypertas, "hcall-term");
324     add_str(hypertas, "hcall-dabr");
325     add_str(hypertas, "hcall-interrupt");
326     add_str(hypertas, "hcall-tce");
327     add_str(hypertas, "hcall-vio");
328     add_str(hypertas, "hcall-splpar");
329     add_str(hypertas, "hcall-bulk");
330     add_str(hypertas, "hcall-set-mode");
331     add_str(qemu_hypertas, "hcall-memop1");
332 
333     fdt = g_malloc0(FDT_MAX_SIZE);
334     _FDT((fdt_create(fdt, FDT_MAX_SIZE)));
335 
336     if (kernel_size) {
337         _FDT((fdt_add_reservemap_entry(fdt, KERNEL_LOAD_ADDR, kernel_size)));
338     }
339     if (initrd_size) {
340         _FDT((fdt_add_reservemap_entry(fdt, initrd_base, initrd_size)));
341     }
342     _FDT((fdt_finish_reservemap(fdt)));
343 
344     /* Root node */
345     _FDT((fdt_begin_node(fdt, "")));
346     _FDT((fdt_property_string(fdt, "device_type", "chrp")));
347     _FDT((fdt_property_string(fdt, "model", "IBM pSeries (emulated by qemu)")));
348     _FDT((fdt_property_string(fdt, "compatible", "qemu,pseries")));
349 
350     _FDT((fdt_property_cell(fdt, "#address-cells", 0x2)));
351     _FDT((fdt_property_cell(fdt, "#size-cells", 0x2)));
352 
353     /* /chosen */
354     _FDT((fdt_begin_node(fdt, "chosen")));
355 
356     /* Set Form1_affinity */
357     _FDT((fdt_property(fdt, "ibm,architecture-vec-5", vec5, sizeof(vec5))));
358 
359     _FDT((fdt_property_string(fdt, "bootargs", kernel_cmdline)));
360     _FDT((fdt_property(fdt, "linux,initrd-start",
361                        &start_prop, sizeof(start_prop))));
362     _FDT((fdt_property(fdt, "linux,initrd-end",
363                        &end_prop, sizeof(end_prop))));
364     if (kernel_size) {
365         uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
366                               cpu_to_be64(kernel_size) };
367 
368         _FDT((fdt_property(fdt, "qemu,boot-kernel", &kprop, sizeof(kprop))));
369         if (little_endian) {
370             _FDT((fdt_property(fdt, "qemu,boot-kernel-le", NULL, 0)));
371         }
372     }
373     if (boot_device) {
374         _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device)));
375     }
376     if (boot_menu) {
377         _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu)));
378     }
379     _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width)));
380     _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height)));
381     _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth)));
382 
383     _FDT((fdt_end_node(fdt)));
384 
385     /* cpus */
386     _FDT((fdt_begin_node(fdt, "cpus")));
387 
388     _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
389     _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
390 
391     CPU_FOREACH(cs) {
392         PowerPCCPU *cpu = POWERPC_CPU(cs);
393         CPUPPCState *env = &cpu->env;
394         DeviceClass *dc = DEVICE_GET_CLASS(cs);
395         PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
396         int index = ppc_get_vcpu_dt_id(cpu);
397         char *nodename;
398         uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
399                            0xffffffff, 0xffffffff};
400         uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ;
401         uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
402         uint32_t page_sizes_prop[64];
403         size_t page_sizes_prop_size;
404 
405         if ((index % smt) != 0) {
406             continue;
407         }
408 
409         nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
410 
411         _FDT((fdt_begin_node(fdt, nodename)));
412 
413         g_free(nodename);
414 
415         _FDT((fdt_property_cell(fdt, "reg", index)));
416         _FDT((fdt_property_string(fdt, "device_type", "cpu")));
417 
418         _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR])));
419         _FDT((fdt_property_cell(fdt, "d-cache-block-size",
420                                 env->dcache_line_size)));
421         _FDT((fdt_property_cell(fdt, "d-cache-line-size",
422                                 env->dcache_line_size)));
423         _FDT((fdt_property_cell(fdt, "i-cache-block-size",
424                                 env->icache_line_size)));
425         _FDT((fdt_property_cell(fdt, "i-cache-line-size",
426                                 env->icache_line_size)));
427 
428         if (pcc->l1_dcache_size) {
429             _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size)));
430         } else {
431             fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n");
432         }
433         if (pcc->l1_icache_size) {
434             _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size)));
435         } else {
436             fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n");
437         }
438 
439         _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
440         _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
441         _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
442         _FDT((fdt_property_string(fdt, "status", "okay")));
443         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
444 
445         if (env->spr_cb[SPR_PURR].oea_read) {
446             _FDT((fdt_property(fdt, "ibm,purr", NULL, 0)));
447         }
448 
449         if (env->mmu_model & POWERPC_MMU_1TSEG) {
450             _FDT((fdt_property(fdt, "ibm,processor-segment-sizes",
451                                segs, sizeof(segs))));
452         }
453 
454         /* Advertise VMX/VSX (vector extensions) if available
455          *   0 / no property == no vector extensions
456          *   1               == VMX / Altivec available
457          *   2               == VSX available */
458         if (env->insns_flags & PPC_ALTIVEC) {
459             uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;
460 
461             _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx)));
462         }
463 
464         /* Advertise DFP (Decimal Floating Point) if available
465          *   0 / no property == no DFP
466          *   1               == DFP available */
467         if (env->insns_flags2 & PPC2_DFP) {
468             _FDT((fdt_property_cell(fdt, "ibm,dfp", 1)));
469         }
470 
471         page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop,
472                                                       sizeof(page_sizes_prop));
473         if (page_sizes_prop_size) {
474             _FDT((fdt_property(fdt, "ibm,segment-page-sizes",
475                                page_sizes_prop, page_sizes_prop_size)));
476         }
477 
478         _FDT((fdt_property_cell(fdt, "ibm,chip-id",
479                                 cs->cpu_index / cpus_per_socket)));
480 
481         _FDT((fdt_end_node(fdt)));
482     }
483 
484     _FDT((fdt_end_node(fdt)));
485 
486     /* RTAS */
487     _FDT((fdt_begin_node(fdt, "rtas")));
488 
489     if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
490         add_str(hypertas, "hcall-multi-tce");
491     }
492     _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas->str,
493                        hypertas->len)));
494     g_string_free(hypertas, TRUE);
495     _FDT((fdt_property(fdt, "qemu,hypertas-functions", qemu_hypertas->str,
496                        qemu_hypertas->len)));
497     g_string_free(qemu_hypertas, TRUE);
498 
499     _FDT((fdt_property(fdt, "ibm,associativity-reference-points",
500         refpoints, sizeof(refpoints))));
501 
502     _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)));
503 
504     _FDT((fdt_end_node(fdt)));
505 
506     /* interrupt controller */
507     _FDT((fdt_begin_node(fdt, "interrupt-controller")));
508 
509     _FDT((fdt_property_string(fdt, "device_type",
510                               "PowerPC-External-Interrupt-Presentation")));
511     _FDT((fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")));
512     _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
513     _FDT((fdt_property(fdt, "ibm,interrupt-server-ranges",
514                        interrupt_server_ranges_prop,
515                        sizeof(interrupt_server_ranges_prop))));
516     _FDT((fdt_property_cell(fdt, "#interrupt-cells", 2)));
517     _FDT((fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)));
518     _FDT((fdt_property_cell(fdt, "phandle", PHANDLE_XICP)));
519 
520     _FDT((fdt_end_node(fdt)));
521 
522     /* vdevice */
523     _FDT((fdt_begin_node(fdt, "vdevice")));
524 
525     _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
526     _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
527     _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
528     _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
529     _FDT((fdt_property_cell(fdt, "#interrupt-cells", 0x2)));
530     _FDT((fdt_property(fdt, "interrupt-controller", NULL, 0)));
531 
532     _FDT((fdt_end_node(fdt)));
533 
534     /* event-sources */
535     spapr_events_fdt_skel(fdt, epow_irq);
536 
537     /* /hypervisor node */
538     if (kvm_enabled()) {
539         uint8_t hypercall[16];
540 
541         /* indicate KVM hypercall interface */
542         _FDT((fdt_begin_node(fdt, "hypervisor")));
543         _FDT((fdt_property_string(fdt, "compatible", "linux,kvm")));
544         if (kvmppc_has_cap_fixup_hcalls()) {
545             /*
546              * Older KVM versions with older guest kernels were broken with the
547              * magic page, don't allow the guest to map it.
548              */
549             kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
550                                  sizeof(hypercall));
551             _FDT((fdt_property(fdt, "hcall-instructions", hypercall,
552                               sizeof(hypercall))));
553         }
554         _FDT((fdt_end_node(fdt)));
555     }
556 
557     _FDT((fdt_end_node(fdt))); /* close root node */
558     _FDT((fdt_finish(fdt)));
559 
560     return fdt;
561 }
562 
563 int spapr_h_cas_compose_response(target_ulong addr, target_ulong size)
564 {
565     void *fdt, *fdt_skel;
566     sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 };
567 
568     size -= sizeof(hdr);
569 
570     /* Create sceleton */
571     fdt_skel = g_malloc0(size);
572     _FDT((fdt_create(fdt_skel, size)));
573     _FDT((fdt_begin_node(fdt_skel, "")));
574     _FDT((fdt_end_node(fdt_skel)));
575     _FDT((fdt_finish(fdt_skel)));
576     fdt = g_malloc0(size);
577     _FDT((fdt_open_into(fdt_skel, fdt, size)));
578     g_free(fdt_skel);
579 
580     /* Fix skeleton up */
581     _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
582 
583     /* Pack resulting tree */
584     _FDT((fdt_pack(fdt)));
585 
586     if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
587         trace_spapr_cas_failed(size);
588         return -1;
589     }
590 
591     cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
592     cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
593     trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
594     g_free(fdt);
595 
596     return 0;
597 }
598 
599 static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
600 {
601     uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
602                                 cpu_to_be32(0x0), cpu_to_be32(0x0),
603                                 cpu_to_be32(0x0)};
604     char mem_name[32];
605     hwaddr node0_size, mem_start, node_size;
606     uint64_t mem_reg_property[2];
607     int i, off;
608 
609     /* memory node(s) */
610     if (nb_numa_nodes > 1 && numa_info[0].node_mem < ram_size) {
611         node0_size = numa_info[0].node_mem;
612     } else {
613         node0_size = ram_size;
614     }
615 
616     /* RMA */
617     mem_reg_property[0] = 0;
618     mem_reg_property[1] = cpu_to_be64(spapr->rma_size);
619     off = fdt_add_subnode(fdt, 0, "memory@0");
620     _FDT(off);
621     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
622     _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
623                       sizeof(mem_reg_property))));
624     _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
625                       sizeof(associativity))));
626 
627     /* RAM: Node 0 */
628     if (node0_size > spapr->rma_size) {
629         mem_reg_property[0] = cpu_to_be64(spapr->rma_size);
630         mem_reg_property[1] = cpu_to_be64(node0_size - spapr->rma_size);
631 
632         sprintf(mem_name, "memory@" TARGET_FMT_lx, spapr->rma_size);
633         off = fdt_add_subnode(fdt, 0, mem_name);
634         _FDT(off);
635         _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
636         _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
637                           sizeof(mem_reg_property))));
638         _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
639                           sizeof(associativity))));
640     }
641 
642     /* RAM: Node 1 and beyond */
643     mem_start = node0_size;
644     for (i = 1; i < nb_numa_nodes; i++) {
645         mem_reg_property[0] = cpu_to_be64(mem_start);
646         if (mem_start >= ram_size) {
647             node_size = 0;
648         } else {
649             node_size = numa_info[i].node_mem;
650             if (node_size > ram_size - mem_start) {
651                 node_size = ram_size - mem_start;
652             }
653         }
654         mem_reg_property[1] = cpu_to_be64(node_size);
655         associativity[3] = associativity[4] = cpu_to_be32(i);
656         sprintf(mem_name, "memory@" TARGET_FMT_lx, mem_start);
657         off = fdt_add_subnode(fdt, 0, mem_name);
658         _FDT(off);
659         _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
660         _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
661                           sizeof(mem_reg_property))));
662         _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
663                           sizeof(associativity))));
664         mem_start += node_size;
665     }
666 
667     return 0;
668 }
669 
670 static void spapr_finalize_fdt(sPAPREnvironment *spapr,
671                                hwaddr fdt_addr,
672                                hwaddr rtas_addr,
673                                hwaddr rtas_size)
674 {
675     int ret, i;
676     size_t cb = 0;
677     char *bootlist;
678     void *fdt;
679     sPAPRPHBState *phb;
680 
681     fdt = g_malloc(FDT_MAX_SIZE);
682 
683     /* open out the base tree into a temp buffer for the final tweaks */
684     _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
685 
686     ret = spapr_populate_memory(spapr, fdt);
687     if (ret < 0) {
688         fprintf(stderr, "couldn't setup memory nodes in fdt\n");
689         exit(1);
690     }
691 
692     ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
693     if (ret < 0) {
694         fprintf(stderr, "couldn't setup vio devices in fdt\n");
695         exit(1);
696     }
697 
698     QLIST_FOREACH(phb, &spapr->phbs, list) {
699         ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
700     }
701 
702     if (ret < 0) {
703         fprintf(stderr, "couldn't setup PCI devices in fdt\n");
704         exit(1);
705     }
706 
707     /* RTAS */
708     ret = spapr_rtas_device_tree_setup(fdt, rtas_addr, rtas_size);
709     if (ret < 0) {
710         fprintf(stderr, "Couldn't set up RTAS device tree properties\n");
711     }
712 
713     /* Advertise NUMA via ibm,associativity */
714     ret = spapr_fixup_cpu_dt(fdt, spapr);
715     if (ret < 0) {
716         fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
717     }
718 
719     bootlist = get_boot_devices_list(&cb, true);
720     if (cb && bootlist) {
721         int offset = fdt_path_offset(fdt, "/chosen");
722         if (offset < 0) {
723             exit(1);
724         }
725         for (i = 0; i < cb; i++) {
726             if (bootlist[i] == '\n') {
727                 bootlist[i] = ' ';
728             }
729 
730         }
731         ret = fdt_setprop_string(fdt, offset, "qemu,boot-list", bootlist);
732     }
733 
734     if (!spapr->has_graphics) {
735         spapr_populate_chosen_stdout(fdt, spapr->vio_bus);
736     }
737 
738     _FDT((fdt_pack(fdt)));
739 
740     if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
741         hw_error("FDT too big ! 0x%x bytes (max is 0x%x)\n",
742                  fdt_totalsize(fdt), FDT_MAX_SIZE);
743         exit(1);
744     }
745 
746     cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
747 
748     g_free(fdt);
749 }
750 
751 static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
752 {
753     return (addr & 0x0fffffff) + KERNEL_LOAD_ADDR;
754 }
755 
756 static void emulate_spapr_hypercall(PowerPCCPU *cpu)
757 {
758     CPUPPCState *env = &cpu->env;
759 
760     if (msr_pr) {
761         hcall_dprintf("Hypercall made with MSR[PR]=1\n");
762         env->gpr[3] = H_PRIVILEGE;
763     } else {
764         env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
765     }
766 }
767 
768 static void spapr_reset_htab(sPAPREnvironment *spapr)
769 {
770     long shift;
771 
772     /* allocate hash page table.  For now we always make this 16mb,
773      * later we should probably make it scale to the size of guest
774      * RAM */
775 
776     shift = kvmppc_reset_htab(spapr->htab_shift);
777 
778     if (shift > 0) {
779         /* Kernel handles htab, we don't need to allocate one */
780         spapr->htab_shift = shift;
781         kvmppc_kern_htab = true;
782     } else {
783         if (!spapr->htab) {
784             /* Allocate an htab if we don't yet have one */
785             spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
786         }
787 
788         /* And clear it */
789         memset(spapr->htab, 0, HTAB_SIZE(spapr));
790     }
791 
792     /* Update the RMA size if necessary */
793     if (spapr->vrma_adjust) {
794         hwaddr node0_size = (nb_numa_nodes > 1) ?
795             numa_info[0].node_mem : ram_size;
796         spapr->rma_size = kvmppc_rma_size(node0_size, spapr->htab_shift);
797     }
798 }
799 
800 static void ppc_spapr_reset(void)
801 {
802     PowerPCCPU *first_ppc_cpu;
803 
804     /* Reset the hash table & recalc the RMA */
805     spapr_reset_htab(spapr);
806 
807     qemu_devices_reset();
808 
809     /* Load the fdt */
810     spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
811                        spapr->rtas_size);
812 
813     /* Set up the entry state */
814     first_ppc_cpu = POWERPC_CPU(first_cpu);
815     first_ppc_cpu->env.gpr[3] = spapr->fdt_addr;
816     first_ppc_cpu->env.gpr[5] = 0;
817     first_cpu->halted = 0;
818     first_ppc_cpu->env.nip = spapr->entry_point;
819 
820 }
821 
822 static void spapr_cpu_reset(void *opaque)
823 {
824     PowerPCCPU *cpu = opaque;
825     CPUState *cs = CPU(cpu);
826     CPUPPCState *env = &cpu->env;
827 
828     cpu_reset(cs);
829 
830     /* All CPUs start halted.  CPU0 is unhalted from the machine level
831      * reset code and the rest are explicitly started up by the guest
832      * using an RTAS call */
833     cs->halted = 1;
834 
835     env->spr[SPR_HIOR] = 0;
836 
837     env->external_htab = (uint8_t *)spapr->htab;
838     if (kvm_enabled() && !env->external_htab) {
839         /*
840          * HV KVM, set external_htab to 1 so our ppc_hash64_load_hpte*
841          * functions do the right thing.
842          */
843         env->external_htab = (void *)1;
844     }
845     env->htab_base = -1;
846     /*
847      * htab_mask is the mask used to normalize hash value to PTEG index.
848      * htab_shift is log2 of hash table size.
849      * We have 8 hpte per group, and each hpte is 16 bytes.
850      * ie have 128 bytes per hpte entry.
851      */
852     env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1;
853     env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab |
854         (spapr->htab_shift - 18);
855 }
856 
857 static void spapr_create_nvram(sPAPREnvironment *spapr)
858 {
859     DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram");
860     DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
861 
862     if (dinfo) {
863         qdev_prop_set_drive_nofail(dev, "drive", dinfo->bdrv);
864     }
865 
866     qdev_init_nofail(dev);
867 
868     spapr->nvram = (struct sPAPRNVRAM *)dev;
869 }
870 
871 /* Returns whether we want to use VGA or not */
872 static int spapr_vga_init(PCIBus *pci_bus)
873 {
874     switch (vga_interface_type) {
875     case VGA_NONE:
876         return false;
877     case VGA_DEVICE:
878         return true;
879     case VGA_STD:
880         return pci_vga_init(pci_bus) != NULL;
881     default:
882         fprintf(stderr, "This vga model is not supported,"
883                 "currently it only supports -vga std\n");
884         exit(0);
885     }
886 }
887 
888 static const VMStateDescription vmstate_spapr = {
889     .name = "spapr",
890     .version_id = 2,
891     .minimum_version_id = 1,
892     .fields = (VMStateField[]) {
893         VMSTATE_UNUSED(4), /* used to be @next_irq */
894 
895         /* RTC offset */
896         VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
897         VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2),
898         VMSTATE_END_OF_LIST()
899     },
900 };
901 
/*
 * Accessors for entries of the hash page table.
 *
 * Each HPTE occupies two 64-bit words, so entry _i of _table starts at
 * word index _i * 2.  The flag tests look at the first word of the entry
 * after passing it through tswap64().
 *
 * HPTE() is fully parenthesized so that it can be used safely inside
 * larger expressions.
 */
#define HPTE(_table, _i)   ((void *)(((uint64_t *)(_table)) + ((_i) * 2)))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
906 
907 static int htab_save_setup(QEMUFile *f, void *opaque)
908 {
909     sPAPREnvironment *spapr = opaque;
910 
911     /* "Iteration" header */
912     qemu_put_be32(f, spapr->htab_shift);
913 
914     if (spapr->htab) {
915         spapr->htab_save_index = 0;
916         spapr->htab_first_pass = true;
917     } else {
918         assert(kvm_enabled());
919 
920         spapr->htab_fd = kvmppc_get_htab_fd(false);
921         if (spapr->htab_fd < 0) {
922             fprintf(stderr, "Unable to open fd for reading hash table from KVM: %s\n",
923                     strerror(errno));
924             return -1;
925         }
926     }
927 
928 
929     return 0;
930 }
931 
932 static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr,
933                                  int64_t max_ns)
934 {
935     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
936     int index = spapr->htab_save_index;
937     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
938 
939     assert(spapr->htab_first_pass);
940 
941     do {
942         int chunkstart;
943 
944         /* Consume invalid HPTEs */
945         while ((index < htabslots)
946                && !HPTE_VALID(HPTE(spapr->htab, index))) {
947             index++;
948             CLEAN_HPTE(HPTE(spapr->htab, index));
949         }
950 
951         /* Consume valid HPTEs */
952         chunkstart = index;
953         while ((index < htabslots)
954                && HPTE_VALID(HPTE(spapr->htab, index))) {
955             index++;
956             CLEAN_HPTE(HPTE(spapr->htab, index));
957         }
958 
959         if (index > chunkstart) {
960             int n_valid = index - chunkstart;
961 
962             qemu_put_be32(f, chunkstart);
963             qemu_put_be16(f, n_valid);
964             qemu_put_be16(f, 0);
965             qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
966                             HASH_PTE_SIZE_64 * n_valid);
967 
968             if ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
969                 break;
970             }
971         }
972     } while ((index < htabslots) && !qemu_file_rate_limit(f));
973 
974     if (index >= htabslots) {
975         assert(index == htabslots);
976         index = 0;
977         spapr->htab_first_pass = false;
978     }
979     spapr->htab_save_index = index;
980 }
981 
982 static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr,
983                                 int64_t max_ns)
984 {
985     bool final = max_ns < 0;
986     int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
987     int examined = 0, sent = 0;
988     int index = spapr->htab_save_index;
989     int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
990 
991     assert(!spapr->htab_first_pass);
992 
993     do {
994         int chunkstart, invalidstart;
995 
996         /* Consume non-dirty HPTEs */
997         while ((index < htabslots)
998                && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
999             index++;
1000             examined++;
1001         }
1002 
1003         chunkstart = index;
1004         /* Consume valid dirty HPTEs */
1005         while ((index < htabslots)
1006                && HPTE_DIRTY(HPTE(spapr->htab, index))
1007                && HPTE_VALID(HPTE(spapr->htab, index))) {
1008             CLEAN_HPTE(HPTE(spapr->htab, index));
1009             index++;
1010             examined++;
1011         }
1012 
1013         invalidstart = index;
1014         /* Consume invalid dirty HPTEs */
1015         while ((index < htabslots)
1016                && HPTE_DIRTY(HPTE(spapr->htab, index))
1017                && !HPTE_VALID(HPTE(spapr->htab, index))) {
1018             CLEAN_HPTE(HPTE(spapr->htab, index));
1019             index++;
1020             examined++;
1021         }
1022 
1023         if (index > chunkstart) {
1024             int n_valid = invalidstart - chunkstart;
1025             int n_invalid = index - invalidstart;
1026 
1027             qemu_put_be32(f, chunkstart);
1028             qemu_put_be16(f, n_valid);
1029             qemu_put_be16(f, n_invalid);
1030             qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
1031                             HASH_PTE_SIZE_64 * n_valid);
1032             sent += index - chunkstart;
1033 
1034             if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
1035                 break;
1036             }
1037         }
1038 
1039         if (examined >= htabslots) {
1040             break;
1041         }
1042 
1043         if (index >= htabslots) {
1044             assert(index == htabslots);
1045             index = 0;
1046         }
1047     } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
1048 
1049     if (index >= htabslots) {
1050         assert(index == htabslots);
1051         index = 0;
1052     }
1053 
1054     spapr->htab_save_index = index;
1055 
1056     return (examined >= htabslots) && (sent == 0) ? 1 : 0;
1057 }
1058 
1059 #define MAX_ITERATION_NS    5000000 /* 5 ms */
1060 #define MAX_KVM_BUF_SIZE    2048
1061 
1062 static int htab_save_iterate(QEMUFile *f, void *opaque)
1063 {
1064     sPAPREnvironment *spapr = opaque;
1065     int rc = 0;
1066 
1067     /* Iteration header */
1068     qemu_put_be32(f, 0);
1069 
1070     if (!spapr->htab) {
1071         assert(kvm_enabled());
1072 
1073         rc = kvmppc_save_htab(f, spapr->htab_fd,
1074                               MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
1075         if (rc < 0) {
1076             return rc;
1077         }
1078     } else  if (spapr->htab_first_pass) {
1079         htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
1080     } else {
1081         rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
1082     }
1083 
1084     /* End marker */
1085     qemu_put_be32(f, 0);
1086     qemu_put_be16(f, 0);
1087     qemu_put_be16(f, 0);
1088 
1089     return rc;
1090 }
1091 
1092 static int htab_save_complete(QEMUFile *f, void *opaque)
1093 {
1094     sPAPREnvironment *spapr = opaque;
1095 
1096     /* Iteration header */
1097     qemu_put_be32(f, 0);
1098 
1099     if (!spapr->htab) {
1100         int rc;
1101 
1102         assert(kvm_enabled());
1103 
1104         rc = kvmppc_save_htab(f, spapr->htab_fd, MAX_KVM_BUF_SIZE, -1);
1105         if (rc < 0) {
1106             return rc;
1107         }
1108         close(spapr->htab_fd);
1109         spapr->htab_fd = -1;
1110     } else {
1111         htab_save_later_pass(f, spapr, -1);
1112     }
1113 
1114     /* End marker */
1115     qemu_put_be32(f, 0);
1116     qemu_put_be16(f, 0);
1117     qemu_put_be16(f, 0);
1118 
1119     return 0;
1120 }
1121 
1122 static int htab_load(QEMUFile *f, void *opaque, int version_id)
1123 {
1124     sPAPREnvironment *spapr = opaque;
1125     uint32_t section_hdr;
1126     int fd = -1;
1127 
1128     if (version_id < 1 || version_id > 1) {
1129         fprintf(stderr, "htab_load() bad version\n");
1130         return -EINVAL;
1131     }
1132 
1133     section_hdr = qemu_get_be32(f);
1134 
1135     if (section_hdr) {
1136         /* First section, just the hash shift */
1137         if (spapr->htab_shift != section_hdr) {
1138             return -EINVAL;
1139         }
1140         return 0;
1141     }
1142 
1143     if (!spapr->htab) {
1144         assert(kvm_enabled());
1145 
1146         fd = kvmppc_get_htab_fd(true);
1147         if (fd < 0) {
1148             fprintf(stderr, "Unable to open fd to restore KVM hash table: %s\n",
1149                     strerror(errno));
1150         }
1151     }
1152 
1153     while (true) {
1154         uint32_t index;
1155         uint16_t n_valid, n_invalid;
1156 
1157         index = qemu_get_be32(f);
1158         n_valid = qemu_get_be16(f);
1159         n_invalid = qemu_get_be16(f);
1160 
1161         if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
1162             /* End of Stream */
1163             break;
1164         }
1165 
1166         if ((index + n_valid + n_invalid) >
1167             (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
1168             /* Bad index in stream */
1169             fprintf(stderr, "htab_load() bad index %d (%hd+%hd entries) "
1170                     "in htab stream (htab_shift=%d)\n", index, n_valid, n_invalid,
1171                     spapr->htab_shift);
1172             return -EINVAL;
1173         }
1174 
1175         if (spapr->htab) {
1176             if (n_valid) {
1177                 qemu_get_buffer(f, HPTE(spapr->htab, index),
1178                                 HASH_PTE_SIZE_64 * n_valid);
1179             }
1180             if (n_invalid) {
1181                 memset(HPTE(spapr->htab, index + n_valid), 0,
1182                        HASH_PTE_SIZE_64 * n_invalid);
1183             }
1184         } else {
1185             int rc;
1186 
1187             assert(fd >= 0);
1188 
1189             rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid);
1190             if (rc < 0) {
1191                 return rc;
1192             }
1193         }
1194     }
1195 
1196     if (!spapr->htab) {
1197         assert(fd >= 0);
1198         close(fd);
1199     }
1200 
1201     return 0;
1202 }
1203 
1204 static SaveVMHandlers savevm_htab_handlers = {
1205     .save_live_setup = htab_save_setup,
1206     .save_live_iterate = htab_save_iterate,
1207     .save_live_complete = htab_save_complete,
1208     .load_state = htab_load,
1209 };
1210 
1211 /* pSeries LPAR / sPAPR hardware init */
1212 static void ppc_spapr_init(MachineState *machine)
1213 {
1214     ram_addr_t ram_size = machine->ram_size;
1215     const char *cpu_model = machine->cpu_model;
1216     const char *kernel_filename = machine->kernel_filename;
1217     const char *kernel_cmdline = machine->kernel_cmdline;
1218     const char *initrd_filename = machine->initrd_filename;
1219     const char *boot_device = machine->boot_order;
1220     PowerPCCPU *cpu;
1221     CPUPPCState *env;
1222     PCIHostState *phb;
1223     int i;
1224     MemoryRegion *sysmem = get_system_memory();
1225     MemoryRegion *ram = g_new(MemoryRegion, 1);
1226     MemoryRegion *rma_region;
1227     void *rma = NULL;
1228     hwaddr rma_alloc_size;
1229     hwaddr node0_size = (nb_numa_nodes > 1) ? numa_info[0].node_mem : ram_size;
1230     uint32_t initrd_base = 0;
1231     long kernel_size = 0, initrd_size = 0;
1232     long load_limit, rtas_limit, fw_size;
1233     bool kernel_le = false;
1234     char *filename;
1235 
1236     msi_supported = true;
1237 
1238     spapr = g_malloc0(sizeof(*spapr));
1239     QLIST_INIT(&spapr->phbs);
1240 
1241     cpu_ppc_hypercall = emulate_spapr_hypercall;
1242 
1243     /* Allocate RMA if necessary */
1244     rma_alloc_size = kvmppc_alloc_rma(&rma);
1245 
1246     if (rma_alloc_size == -1) {
1247         hw_error("qemu: Unable to create RMA\n");
1248         exit(1);
1249     }
1250 
1251     if (rma_alloc_size && (rma_alloc_size < node0_size)) {
1252         spapr->rma_size = rma_alloc_size;
1253     } else {
1254         spapr->rma_size = node0_size;
1255 
1256         /* With KVM, we don't actually know whether KVM supports an
1257          * unbounded RMA (PR KVM) or is limited by the hash table size
1258          * (HV KVM using VRMA), so we always assume the latter
1259          *
1260          * In that case, we also limit the initial allocations for RTAS
1261          * etc... to 256M since we have no way to know what the VRMA size
1262          * is going to be as it depends on the size of the hash table
1263          * isn't determined yet.
1264          */
1265         if (kvm_enabled()) {
1266             spapr->vrma_adjust = 1;
1267             spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
1268         }
1269     }
1270 
1271     if (spapr->rma_size > node0_size) {
1272         fprintf(stderr, "Error: Numa node 0 has to span the RMA (%#08"HWADDR_PRIx")\n",
1273                 spapr->rma_size);
1274         exit(1);
1275     }
1276 
1277     /* We place the device tree and RTAS just below either the top of the RMA,
1278      * or just below 2GB, whichever is lowere, so that it can be
1279      * processed with 32-bit real mode code if necessary */
1280     rtas_limit = MIN(spapr->rma_size, 0x80000000);
1281     spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
1282     spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
1283     load_limit = spapr->fdt_addr - FW_OVERHEAD;
1284 
1285     /* We aim for a hash table of size 1/128 the size of RAM.  The
1286      * normal rule of thumb is 1/64 the size of RAM, but that's much
1287      * more than needed for the Linux guests we support. */
1288     spapr->htab_shift = 18; /* Minimum architected size */
1289     while (spapr->htab_shift <= 46) {
1290         if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
1291             break;
1292         }
1293         spapr->htab_shift++;
1294     }
1295 
1296     /* Set up Interrupt Controller before we create the VCPUs */
1297     spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads,
1298                                   XICS_IRQS);
1299 
1300     /* init CPUs */
1301     if (cpu_model == NULL) {
1302         cpu_model = kvm_enabled() ? "host" : "POWER7";
1303     }
1304     for (i = 0; i < smp_cpus; i++) {
1305         cpu = cpu_ppc_init(cpu_model);
1306         if (cpu == NULL) {
1307             fprintf(stderr, "Unable to find PowerPC CPU definition\n");
1308             exit(1);
1309         }
1310         env = &cpu->env;
1311 
1312         /* Set time-base frequency to 512 MHz */
1313         cpu_ppc_tb_init(env, TIMEBASE_FREQ);
1314 
1315         /* PAPR always has exception vectors in RAM not ROM. To ensure this,
1316          * MSR[IP] should never be set.
1317          */
1318         env->msr_mask &= ~(1 << 6);
1319 
1320         /* Tell KVM that we're in PAPR mode */
1321         if (kvm_enabled()) {
1322             kvmppc_set_papr(cpu);
1323         }
1324 
1325         if (cpu->max_compat) {
1326             if (ppc_set_compat(cpu, cpu->max_compat) < 0) {
1327                 exit(1);
1328             }
1329         }
1330 
1331         xics_cpu_setup(spapr->icp, cpu);
1332 
1333         qemu_register_reset(spapr_cpu_reset, cpu);
1334     }
1335 
1336     /* allocate RAM */
1337     spapr->ram_limit = ram_size;
1338     memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram",
1339                                          spapr->ram_limit);
1340     memory_region_add_subregion(sysmem, 0, ram);
1341 
1342     if (rma_alloc_size && rma) {
1343         rma_region = g_new(MemoryRegion, 1);
1344         memory_region_init_ram_ptr(rma_region, NULL, "ppc_spapr.rma",
1345                                    rma_alloc_size, rma);
1346         vmstate_register_ram_global(rma_region);
1347         memory_region_add_subregion(sysmem, 0, rma_region);
1348     }
1349 
1350     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
1351     spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
1352                                            rtas_limit - spapr->rtas_addr);
1353     if (spapr->rtas_size < 0) {
1354         hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
1355         exit(1);
1356     }
1357     if (spapr->rtas_size > RTAS_MAX_SIZE) {
1358         hw_error("RTAS too big ! 0x%lx bytes (max is 0x%x)\n",
1359                  spapr->rtas_size, RTAS_MAX_SIZE);
1360         exit(1);
1361     }
1362     g_free(filename);
1363 
1364     /* Set up EPOW events infrastructure */
1365     spapr_events_init(spapr);
1366 
1367     /* Set up VIO bus */
1368     spapr->vio_bus = spapr_vio_bus_init();
1369 
1370     for (i = 0; i < MAX_SERIAL_PORTS; i++) {
1371         if (serial_hds[i]) {
1372             spapr_vty_create(spapr->vio_bus, serial_hds[i]);
1373         }
1374     }
1375 
1376     /* We always have at least the nvram device on VIO */
1377     spapr_create_nvram(spapr);
1378 
1379     /* Set up PCI */
1380     spapr_pci_msi_init(spapr, SPAPR_PCI_MSI_WINDOW);
1381     spapr_pci_rtas_init();
1382 
1383     phb = spapr_create_phb(spapr, 0);
1384 
1385     for (i = 0; i < nb_nics; i++) {
1386         NICInfo *nd = &nd_table[i];
1387 
1388         if (!nd->model) {
1389             nd->model = g_strdup("ibmveth");
1390         }
1391 
1392         if (strcmp(nd->model, "ibmveth") == 0) {
1393             spapr_vlan_create(spapr->vio_bus, nd);
1394         } else {
1395             pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
1396         }
1397     }
1398 
1399     for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
1400         spapr_vscsi_create(spapr->vio_bus);
1401     }
1402 
1403     /* Graphics */
1404     if (spapr_vga_init(phb->bus)) {
1405         spapr->has_graphics = true;
1406     }
1407 
1408     if (usb_enabled(spapr->has_graphics)) {
1409         pci_create_simple(phb->bus, -1, "pci-ohci");
1410         if (spapr->has_graphics) {
1411             usbdevice_create("keyboard");
1412             usbdevice_create("mouse");
1413         }
1414     }
1415 
1416     if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
1417         fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
1418                 "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
1419         exit(1);
1420     }
1421 
1422     if (kernel_filename) {
1423         uint64_t lowaddr = 0;
1424 
1425         kernel_size = load_elf(kernel_filename, translate_kernel_address, NULL,
1426                                NULL, &lowaddr, NULL, 1, ELF_MACHINE, 0);
1427         if (kernel_size == ELF_LOAD_WRONG_ENDIAN) {
1428             kernel_size = load_elf(kernel_filename,
1429                                    translate_kernel_address, NULL,
1430                                    NULL, &lowaddr, NULL, 0, ELF_MACHINE, 0);
1431             kernel_le = kernel_size > 0;
1432         }
1433         if (kernel_size < 0) {
1434             fprintf(stderr, "qemu: error loading %s: %s\n",
1435                     kernel_filename, load_elf_strerror(kernel_size));
1436             exit(1);
1437         }
1438 
1439         /* load initrd */
1440         if (initrd_filename) {
1441             /* Try to locate the initrd in the gap between the kernel
1442              * and the firmware. Add a bit of space just in case
1443              */
1444             initrd_base = (KERNEL_LOAD_ADDR + kernel_size + 0x1ffff) & ~0xffff;
1445             initrd_size = load_image_targphys(initrd_filename, initrd_base,
1446                                               load_limit - initrd_base);
1447             if (initrd_size < 0) {
1448                 fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
1449                         initrd_filename);
1450                 exit(1);
1451             }
1452         } else {
1453             initrd_base = 0;
1454             initrd_size = 0;
1455         }
1456     }
1457 
1458     if (bios_name == NULL) {
1459         bios_name = FW_FILE_NAME;
1460     }
1461     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1462     fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
1463     if (fw_size < 0) {
1464         hw_error("qemu: could not load LPAR rtas '%s'\n", filename);
1465         exit(1);
1466     }
1467     g_free(filename);
1468 
1469     spapr->entry_point = 0x100;
1470 
1471     vmstate_register(NULL, 0, &vmstate_spapr, spapr);
1472     register_savevm_live(NULL, "spapr/htab", -1, 1,
1473                          &savevm_htab_handlers, spapr);
1474 
1475     /* Prepare the device tree */
1476     spapr->fdt_skel = spapr_create_fdt_skel(initrd_base, initrd_size,
1477                                             kernel_size, kernel_le,
1478                                             boot_device, kernel_cmdline,
1479                                             spapr->epow_irq);
1480     assert(spapr->fdt_skel != NULL);
1481 }
1482 
/*
 * Translate the "kvm-type" machine option into its numeric code.
 *
 * Returns 0 when no type was given, 1 for "HV" and 2 for "PR".  Any other
 * string is reported as an error and terminates QEMU.
 */
static int spapr_kvm_type(const char *vm_type)
{
    static const struct {
        const char *name;
        int code;
    } known_types[] = {
        { "HV", 1 },
        { "PR", 2 },
    };
    unsigned int k;

    if (vm_type == NULL) {
        return 0;
    }

    for (k = 0; k < sizeof(known_types) / sizeof(known_types[0]); k++) {
        if (strcmp(vm_type, known_types[k].name) == 0) {
            return known_types[k].code;
        }
    }

    error_report("Unknown kvm-type specified '%s'", vm_type);
    exit(1);
}
1500 
1501 /*
1502  * Implementation of an interface to adjust firmware patch
1503  * for the bootindex property handling.
1504  */
1505 static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
1506                                    DeviceState *dev)
1507 {
1508 #define CAST(type, obj, name) \
1509     ((type *)object_dynamic_cast(OBJECT(obj), (name)))
1510     SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
1511     sPAPRPHBState *phb = CAST(sPAPRPHBState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
1512 
1513     if (d) {
1514         void *spapr = CAST(void, bus->parent, "spapr-vscsi");
1515         VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
1516         USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
1517 
1518         if (spapr) {
1519             /*
1520              * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
1521              * We use SRP luns of the form 8000 | (bus << 8) | (id << 5) | lun
1522              * in the top 16 bits of the 64-bit LUN
1523              */
1524             unsigned id = 0x8000 | (d->id << 8) | d->lun;
1525             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1526                                    (uint64_t)id << 48);
1527         } else if (virtio) {
1528             /*
1529              * We use SRP luns of the form 01000000 | (target << 8) | lun
1530              * in the top 32 bits of the 64-bit LUN
1531              * Note: the quote above is from SLOF and it is wrong,
1532              * the actual binding is:
1533              * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
1534              */
1535             unsigned id = 0x1000000 | (d->id << 16) | d->lun;
1536             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1537                                    (uint64_t)id << 32);
1538         } else if (usb) {
1539             /*
1540              * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
1541              * in the top 32 bits of the 64-bit LUN
1542              */
1543             unsigned usb_port = atoi(usb->port->path);
1544             unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
1545             return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
1546                                    (uint64_t)id << 32);
1547         }
1548     }
1549 
1550     if (phb) {
1551         /* Replace "pci" with "pci@800000020000000" */
1552         return g_strdup_printf("pci@%"PRIX64, phb->buid);
1553     }
1554 
1555     return NULL;
1556 }
1557 
1558 static char *spapr_get_kvm_type(Object *obj, Error **errp)
1559 {
1560     sPAPRMachineState *sm = SPAPR_MACHINE(obj);
1561 
1562     return g_strdup(sm->kvm_type);
1563 }
1564 
1565 static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
1566 {
1567     sPAPRMachineState *sm = SPAPR_MACHINE(obj);
1568 
1569     g_free(sm->kvm_type);
1570     sm->kvm_type = g_strdup(value);
1571 }
1572 
1573 static void spapr_machine_initfn(Object *obj)
1574 {
1575     object_property_add_str(obj, "kvm-type",
1576                             spapr_get_kvm_type, spapr_set_kvm_type, NULL);
1577 }
1578 
1579 static void spapr_machine_class_init(ObjectClass *oc, void *data)
1580 {
1581     MachineClass *mc = MACHINE_CLASS(oc);
1582     FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
1583 
1584     mc->name = "pseries";
1585     mc->desc = "pSeries Logical Partition (PAPR compliant)";
1586     mc->is_default = 1;
1587     mc->init = ppc_spapr_init;
1588     mc->reset = ppc_spapr_reset;
1589     mc->block_default_type = IF_SCSI;
1590     mc->max_cpus = MAX_CPUS;
1591     mc->no_parallel = 1;
1592     mc->default_boot_order = NULL;
1593     mc->kvm_type = spapr_kvm_type;
1594 
1595     fwc->get_dev_path = spapr_get_fw_dev_path;
1596 }
1597 
1598 static const TypeInfo spapr_machine_info = {
1599     .name          = TYPE_SPAPR_MACHINE,
1600     .parent        = TYPE_MACHINE,
1601     .instance_size = sizeof(sPAPRMachineState),
1602     .instance_init = spapr_machine_initfn,
1603     .class_init    = spapr_machine_class_init,
1604     .interfaces = (InterfaceInfo[]) {
1605         { TYPE_FW_PATH_PROVIDER },
1606         { }
1607     },
1608 };
1609 
1610 static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data)
1611 {
1612     MachineClass *mc = MACHINE_CLASS(oc);
1613 
1614     mc->name = "pseries-2.1";
1615     mc->desc = "pSeries Logical Partition (PAPR compliant) v2.1";
1616     mc->is_default = 0;
1617 }
1618 
1619 static const TypeInfo spapr_machine_2_1_info = {
1620     .name          = TYPE_SPAPR_MACHINE "2.1",
1621     .parent        = TYPE_SPAPR_MACHINE,
1622     .class_init    = spapr_machine_2_1_class_init,
1623 };
1624 
1625 static void spapr_machine_register_types(void)
1626 {
1627     type_register_static(&spapr_machine_info);
1628     type_register_static(&spapr_machine_2_1_info);
1629 }
1630 
1631 type_init(spapr_machine_register_types)
1632