1 /*
2  *  Virtual page mapping
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "qemu-common.h"
22 #include "qapi/error.h"
23 
24 #include "qemu/cutils.h"
25 #include "cpu.h"
26 #include "exec/exec-all.h"
27 #include "exec/target_page.h"
28 #include "tcg.h"
29 #include "hw/qdev-core.h"
30 #include "hw/qdev-properties.h"
31 #if !defined(CONFIG_USER_ONLY)
32 #include "hw/boards.h"
33 #include "hw/xen/xen.h"
34 #endif
35 #include "sysemu/kvm.h"
36 #include "sysemu/sysemu.h"
37 #include "sysemu/tcg.h"
38 #include "qemu/timer.h"
39 #include "qemu/config-file.h"
40 #include "qemu/error-report.h"
41 #include "qemu/qemu-print.h"
42 #if defined(CONFIG_USER_ONLY)
43 #include "qemu.h"
44 #else /* !CONFIG_USER_ONLY */
45 #include "exec/memory.h"
46 #include "exec/ioport.h"
47 #include "sysemu/dma.h"
48 #include "sysemu/hostmem.h"
49 #include "sysemu/hw_accel.h"
50 #include "exec/address-spaces.h"
51 #include "sysemu/xen-mapcache.h"
52 #include "trace-root.h"
53 
54 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
55 #include <linux/falloc.h>
56 #endif
57 
58 #endif
59 #include "qemu/rcu_queue.h"
60 #include "qemu/main-loop.h"
61 #include "translate-all.h"
62 #include "sysemu/replay.h"
63 
64 #include "exec/memory-internal.h"
65 #include "exec/ram_addr.h"
66 #include "exec/log.h"
67 
68 #include "migration/vmstate.h"
69 
70 #include "qemu/range.h"
71 #ifndef _WIN32
72 #include "qemu/mmap-alloc.h"
73 #endif
74 
75 #include "monitor/monitor.h"
76 
77 //#define DEBUG_SUBPAGE
78 
79 #if !defined(CONFIG_USER_ONLY)
80 /* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
81  * are protected by the ramlist lock.
82  */
83 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
84 
85 static MemoryRegion *system_memory;
86 static MemoryRegion *system_io;
87 
88 AddressSpace address_space_io;
89 AddressSpace address_space_memory;
90 
91 static MemoryRegion io_mem_unassigned;
92 #endif
93 
94 CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
95 
96 /* current CPU in the current thread. It is only valid inside
97    cpu_exec() */
98 __thread CPUState *current_cpu;
99 /* 0 = Do not count executed instructions.
100    1 = Precise instruction counting.
101    2 = Adaptive rate instruction counting.  */
102 int use_icount;
103 
104 uintptr_t qemu_host_page_size;
105 intptr_t qemu_host_page_mask;
106 
107 #if !defined(CONFIG_USER_ONLY)
108 
109 typedef struct PhysPageEntry PhysPageEntry;
110 
/*
 * One slot of the multi-level physical page map.  The two bit-fields
 * share a single 32-bit word (6 + 26 bits).
 */
struct PhysPageEntry {
    /*
     * Number of levels to skip to reach the next level (each level
     * covers P_L2_BITS index bits).  0 for a leaf.
     */
    uint32_t skip : 6;
    /*
     * Leaf (!skip): index into the map's sections array.
     * Non-leaf (skip): index into the map's nodes array, or
     * PHYS_MAP_NODE_NIL when no node has been allocated yet.
     */
    uint32_t ptr : 26;
};
117 
118 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
119 
120 /* Size of the L2 (and L3, etc) page tables.  */
121 #define ADDR_SPACE_BITS 64
122 
123 #define P_L2_BITS 9
124 #define P_L2_SIZE (1 << P_L2_BITS)
125 
126 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
127 
128 typedef PhysPageEntry Node[P_L2_SIZE];
129 
/*
 * Backing storage for a physical page map: a growable array of nodes
 * and an array of sections that PhysPageEntry.ptr values index into.
 */
typedef struct PhysPageMap {
    /* NOTE(review): presumably used to defer freeing via RCU -- confirm */
    struct rcu_head rcu;

    /* NOTE(review): presumably used/allocated counts for @sections,
     * mirroring nodes_nb/nodes_nb_alloc -- not visible here, confirm */
    unsigned sections_nb;
    unsigned sections_nb_alloc;
    /* Number of entries of @nodes handed out by phys_map_node_alloc() */
    unsigned nodes_nb;
    /* Number of entries @nodes has room for (see phys_map_node_reserve()) */
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;
140 
/*
 * Lookup structure that maps a physical address to the
 * MemoryRegionSection covering it.
 */
struct AddressSpaceDispatch {
    /*
     * Section returned by the most recent lookup; tried first by
     * address_space_lookup_region() and refreshed on a miss.
     */
    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    /* Node and section arrays that phys_map's entries index into */
    PhysPageMap map;
};
149 
150 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/*
 * State behind a memory region whose ->subpage flag is set: lookups
 * recover it from the region with container_of() and use
 * SUBPAGE_IDX(addr) to pick a per-offset section number.
 */
typedef struct subpage_t {
    /* The region embedded in this structure (container_of() anchor) */
    MemoryRegion iomem;
    /* NOTE(review): presumably the FlatView this subpage belongs to --
     * not used in the code visible here, confirm */
    FlatView *fv;
    /* NOTE(review): presumably the page's base address -- confirm */
    hwaddr base;
    /* Section index for each in-page offset, indexed by SUBPAGE_IDX() */
    uint16_t sub_section[];
} subpage_t;
157 
158 #define PHYS_SECTION_UNASSIGNED 0
159 
160 static void io_mem_init(void);
161 static void memory_map_init(void);
162 static void tcg_log_global_after_sync(MemoryListener *listener);
163 static void tcg_commit(MemoryListener *listener);
164 
/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected); read
 *                   with atomic_rcu_read() when translating for the TLB
 * @tcg_as_listener: listener for tracking changes to the AddressSpace;
 *                   cpu_address_space_init() only registers it when TCG
 *                   is enabled
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};
178 
/*
 * NOTE(review): the users of this structure are outside the part of the
 * file reviewed here; the field descriptions below are inferred from the
 * names only and need confirming.
 */
struct DirtyBitmapSnapshot {
    /* presumably the first RAM address covered by the snapshot -- confirm */
    ram_addr_t start;
    /* presumably the end of the covered range -- confirm (inclusive?) */
    ram_addr_t end;
    /* bitmap words; length is fixed by whoever allocates the snapshot */
    unsigned long dirty[];
};
184 
185 #endif
186 
187 #if !defined(CONFIG_USER_ONLY)
188 
/*
 * Ensure @map can hold @nodes more nodes on top of those already in use,
 * enlarging the node array when it is too small.
 *
 * The size chosen on growth is remembered in a function-local static and
 * used as the minimum size for the next growth, for any map.
 */
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    unsigned wanted = map->nodes_nb + nodes;

    if (wanted <= map->nodes_nb_alloc) {
        /* Already enough room. */
        return;
    }

    map->nodes_nb_alloc = MAX(alloc_hint, wanted);
    map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
    alloc_hint = map->nodes_nb_alloc;
}
198 
/*
 * Hand out the next unused node of @map and return its index.
 *
 * Every slot of the new node is initialised to the same entry: an
 * unassigned leaf (skip 0, PHYS_SECTION_UNASSIGNED) when @leaf is true,
 * otherwise an empty pointer to the next level (skip 1,
 * PHYS_MAP_NODE_NIL).
 *
 * Room must have been reserved beforehand with phys_map_node_reserve();
 * running out of reserved nodes trips an assertion.
 */
static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    uint32_t idx = map->nodes_nb++;
    PhysPageEntry blank;
    PhysPageEntry *slot;
    PhysPageEntry *end;

    assert(idx != PHYS_MAP_NODE_NIL);
    assert(idx != map->nodes_nb_alloc);

    if (leaf) {
        blank.skip = 0;
        blank.ptr = PHYS_SECTION_UNASSIGNED;
    } else {
        blank.skip = 1;
        blank.ptr = PHYS_MAP_NODE_NIL;
    }

    slot = map->nodes[idx];
    for (end = slot + P_L2_SIZE; slot < end; slot++) {
        memcpy(slot, &blank, sizeof(blank));
    }
    return idx;
}
218 
/*
 * Recursive worker for phys_page_set().
 *
 * @lp is an entry at height @level + 1; the node it points to is
 * allocated on demand.  Starting at the slot selected by *@index, slots
 * of that node are filled until either *@nb reaches zero or the node is
 * exhausted.  A slot whose whole span (1 << (@level * P_L2_BITS) index
 * units) is aligned and fully covered becomes a leaf pointing at section
 * @leaf; otherwise the function descends one level for that slot.
 *
 * *@index and *@nb are advanced as ranges are consumed, so the caller
 * sees how far the mapping got.
 */
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, uint64_t *nb, uint16_t leaf,
                                int level)
{
    const int shift = level * P_L2_BITS;
    const hwaddr step = (hwaddr)1 << shift;
    PhysPageEntry *node;
    PhysPageEntry *node_end;

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    node = map->nodes[lp->ptr];
    node_end = &node[P_L2_SIZE];

    for (lp = &node[(*index >> shift) & (P_L2_SIZE - 1)];
         *nb && lp < node_end;
         ++lp) {
        bool whole_slot = (*index & (step - 1)) == 0 && *nb >= step;

        if (!whole_slot) {
            /* Partial coverage: refine this slot one level further down. */
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
            continue;
        }

        lp->skip = 0;
        lp->ptr = leaf;
        *index += step;
        *nb -= step;
    }
}
244 
/*
 * Point @nb consecutive page-map indexes, starting at @index, at section
 * number @leaf in the dispatch map of @d.
 */
static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, uint64_t nb,
                          uint16_t leaf)
{
    PhysPageMap *map = &d->map;

    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(map, 3 * P_L2_LEVELS);

    phys_page_set_level(map, &d->phys_map, &index, &nb, leaf,
                        P_L2_LEVELS - 1);
}
254 
255 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
256  * and update our entry so we can skip it and go directly to the destination.
257  */
phys_page_compact(PhysPageEntry * lp,Node * nodes)258 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
259 {
260     unsigned valid_ptr = P_L2_SIZE;
261     int valid = 0;
262     PhysPageEntry *p;
263     int i;
264 
265     if (lp->ptr == PHYS_MAP_NODE_NIL) {
266         return;
267     }
268 
269     p = nodes[lp->ptr];
270     for (i = 0; i < P_L2_SIZE; i++) {
271         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
272             continue;
273         }
274 
275         valid_ptr = i;
276         valid++;
277         if (p[i].skip) {
278             phys_page_compact(&p[i], nodes);
279         }
280     }
281 
282     /* We can only compress if there's only one child. */
283     if (valid != 1) {
284         return;
285     }
286 
287     assert(valid_ptr < P_L2_SIZE);
288 
289     /* Don't compress if it won't fit in the # of bits we have. */
290     if (P_L2_LEVELS >= (1 << 6) &&
291         lp->skip + p[valid_ptr].skip >= (1 << 6)) {
292         return;
293     }
294 
295     lp->ptr = p[valid_ptr].ptr;
296     if (!p[valid_ptr].skip) {
297         /* If our only child is a leaf, make this a leaf. */
298         /* By design, we should have made this node a leaf to begin with so we
299          * should never reach here.
300          * But since it's so simple to handle this, let's do it just in case we
301          * change this rule.
302          */
303         lp->skip = 0;
304     } else {
305         lp->skip += p[valid_ptr].skip;
306     }
307 }
308 
/*
 * Compact the page map of @d, starting from its root entry.  Nothing is
 * done when the root is itself a leaf (skip == 0).
 */
void address_space_dispatch_compact(AddressSpaceDispatch *d)
{
    if (!d->phys_map.skip) {
        return;
    }
    phys_page_compact(&d->phys_map, d->map.nodes);
}
315 
/*
 * Return true if @addr lies inside @section, judged by the section's
 * offset within its address space and its size.
 */
static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
     * the section must cover the entire address space.
     */
    if (int128_gethi(section->size)) {
        return true;
    }
    return range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}
326 
phys_page_find(AddressSpaceDispatch * d,hwaddr addr)327 static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
328 {
329     PhysPageEntry lp = d->phys_map, *p;
330     Node *nodes = d->map.nodes;
331     MemoryRegionSection *sections = d->map.sections;
332     hwaddr index = addr >> TARGET_PAGE_BITS;
333     int i;
334 
335     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
336         if (lp.ptr == PHYS_MAP_NODE_NIL) {
337             return &sections[PHYS_SECTION_UNASSIGNED];
338         }
339         p = nodes[lp.ptr];
340         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
341     }
342 
343     if (section_covers_addr(&sections[lp.ptr], addr)) {
344         return &sections[lp.ptr];
345     } else {
346         return &sections[PHYS_SECTION_UNASSIGNED];
347     }
348 }
349 
350 /* Called from RCU critical section */
address_space_lookup_region(AddressSpaceDispatch * d,hwaddr addr,bool resolve_subpage)351 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
352                                                         hwaddr addr,
353                                                         bool resolve_subpage)
354 {
355     MemoryRegionSection *section = atomic_read(&d->mru_section);
356     subpage_t *subpage;
357 
358     if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
359         !section_covers_addr(section, addr)) {
360         section = phys_page_find(d, addr);
361         atomic_set(&d->mru_section, section);
362     }
363     if (resolve_subpage && section->mr->subpage) {
364         subpage = container_of(section->mr, subpage_t, iomem);
365         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
366     }
367     return section;
368 }
369 
370 /* Called from RCU critical section */
371 static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch * d,hwaddr addr,hwaddr * xlat,hwaddr * plen,bool resolve_subpage)372 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
373                                  hwaddr *plen, bool resolve_subpage)
374 {
375     MemoryRegionSection *section;
376     MemoryRegion *mr;
377     Int128 diff;
378 
379     section = address_space_lookup_region(d, addr, resolve_subpage);
380     /* Compute offset within MemoryRegionSection */
381     addr -= section->offset_within_address_space;
382 
383     /* Compute offset within MemoryRegion */
384     *xlat = addr + section->offset_within_region;
385 
386     mr = section->mr;
387 
388     /* MMIO registers can be expected to perform full-width accesses based only
389      * on their address, without considering adjacent registers that could
390      * decode to completely different MemoryRegions.  When such registers
391      * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
392      * regions overlap wildly.  For this reason we cannot clamp the accesses
393      * here.
394      *
395      * If the length is small (as is the case for address_space_ldl/stl),
396      * everything works fine.  If the incoming length is large, however,
397      * the caller really has to do the clamping through memory_access_size.
398      */
399     if (memory_region_is_ram(mr)) {
400         diff = int128_sub(section->size, int128_make64(addr));
401         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
402     }
403     return section;
404 }
405 
406 /**
407  * address_space_translate_iommu - translate an address through an IOMMU
408  * memory region and then through the target address space.
409  *
410  * @iommu_mr: the IOMMU memory region that we start the translation from
411  * @addr: the address to be translated through the MMU
412  * @xlat: the translated address offset within the destination memory region.
413  *        It cannot be %NULL.
414  * @plen_out: valid read/write length of the translated address. It
415  *            cannot be %NULL.
416  * @page_mask_out: page mask for the translated address. This
417  *            should only be meaningful for IOMMU translated
418  *            addresses, since there may be huge pages that this bit
419  *            would tell. It can be %NULL if we don't care about it.
420  * @is_write: whether the translation operation is for write
421  * @is_mmio: whether this can be MMIO, set true if it can
422  * @target_as: the address space targeted by the IOMMU
423  * @attrs: transaction attributes
424  *
425  * This function is called from RCU critical section.  It is the common
426  * part of flatview_do_translate and address_space_translate_cached.
427  */
address_space_translate_iommu(IOMMUMemoryRegion * iommu_mr,hwaddr * xlat,hwaddr * plen_out,hwaddr * page_mask_out,bool is_write,bool is_mmio,AddressSpace ** target_as,MemTxAttrs attrs)428 static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
429                                                          hwaddr *xlat,
430                                                          hwaddr *plen_out,
431                                                          hwaddr *page_mask_out,
432                                                          bool is_write,
433                                                          bool is_mmio,
434                                                          AddressSpace **target_as,
435                                                          MemTxAttrs attrs)
436 {
437     MemoryRegionSection *section;
438     hwaddr page_mask = (hwaddr)-1;
439 
440     do {
441         hwaddr addr = *xlat;
442         IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
443         int iommu_idx = 0;
444         IOMMUTLBEntry iotlb;
445 
446         if (imrc->attrs_to_index) {
447             iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
448         }
449 
450         iotlb = imrc->translate(iommu_mr, addr, is_write ?
451                                 IOMMU_WO : IOMMU_RO, iommu_idx);
452 
453         if (!(iotlb.perm & (1 << is_write))) {
454             goto unassigned;
455         }
456 
457         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
458                 | (addr & iotlb.addr_mask));
459         page_mask &= iotlb.addr_mask;
460         *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
461         *target_as = iotlb.target_as;
462 
463         section = address_space_translate_internal(
464                 address_space_to_dispatch(iotlb.target_as), addr, xlat,
465                 plen_out, is_mmio);
466 
467         iommu_mr = memory_region_get_iommu(section->mr);
468     } while (unlikely(iommu_mr));
469 
470     if (page_mask_out) {
471         *page_mask_out = page_mask;
472     }
473     return *section;
474 
475 unassigned:
476     return (MemoryRegionSection) { .mr = &io_mem_unassigned };
477 }
478 
479 /**
480  * flatview_do_translate - translate an address in FlatView
481  *
482  * @fv: the flat view that we want to translate on
483  * @addr: the address to be translated in above address space
484  * @xlat: the translated address offset within memory region. It
485  *        cannot be @NULL.
486  * @plen_out: valid read/write length of the translated address. It
487  *            can be @NULL when we don't care about it.
488  * @page_mask_out: page mask for the translated address. This
489  *            should only be meaningful for IOMMU translated
490  *            addresses, since there may be huge pages that this bit
491  *            would tell. It can be @NULL if we don't care about it.
492  * @is_write: whether the translation operation is for write
493  * @is_mmio: whether this can be MMIO, set true if it can
494  * @target_as: the address space targeted by the IOMMU
495  * @attrs: memory transaction attributes
496  *
497  * This function is called from RCU critical section
498  */
flatview_do_translate(FlatView * fv,hwaddr addr,hwaddr * xlat,hwaddr * plen_out,hwaddr * page_mask_out,bool is_write,bool is_mmio,AddressSpace ** target_as,MemTxAttrs attrs)499 static MemoryRegionSection flatview_do_translate(FlatView *fv,
500                                                  hwaddr addr,
501                                                  hwaddr *xlat,
502                                                  hwaddr *plen_out,
503                                                  hwaddr *page_mask_out,
504                                                  bool is_write,
505                                                  bool is_mmio,
506                                                  AddressSpace **target_as,
507                                                  MemTxAttrs attrs)
508 {
509     MemoryRegionSection *section;
510     IOMMUMemoryRegion *iommu_mr;
511     hwaddr plen = (hwaddr)(-1);
512 
513     if (!plen_out) {
514         plen_out = &plen;
515     }
516 
517     section = address_space_translate_internal(
518             flatview_to_dispatch(fv), addr, xlat,
519             plen_out, is_mmio);
520 
521     iommu_mr = memory_region_get_iommu(section->mr);
522     if (unlikely(iommu_mr)) {
523         return address_space_translate_iommu(iommu_mr, xlat,
524                                              plen_out, page_mask_out,
525                                              is_write, is_mmio,
526                                              target_as, attrs);
527     }
528     if (page_mask_out) {
529         /* Not behind an IOMMU, use default page size. */
530         *page_mask_out = ~TARGET_PAGE_MASK;
531     }
532 
533     return *section;
534 }
535 
536 /* Called from RCU critical section */
address_space_get_iotlb_entry(AddressSpace * as,hwaddr addr,bool is_write,MemTxAttrs attrs)537 IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
538                                             bool is_write, MemTxAttrs attrs)
539 {
540     MemoryRegionSection section;
541     hwaddr xlat, page_mask;
542 
543     /*
544      * This can never be MMIO, and we don't really care about plen,
545      * but page mask.
546      */
547     section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
548                                     NULL, &page_mask, is_write, false, &as,
549                                     attrs);
550 
551     /* Illegal translation */
552     if (section.mr == &io_mem_unassigned) {
553         goto iotlb_fail;
554     }
555 
556     /* Convert memory region offset into address space offset */
557     xlat += section.offset_within_address_space -
558         section.offset_within_region;
559 
560     return (IOMMUTLBEntry) {
561         .target_as = as,
562         .iova = addr & ~page_mask,
563         .translated_addr = xlat & ~page_mask,
564         .addr_mask = page_mask,
565         /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
566         .perm = IOMMU_RW,
567     };
568 
569 iotlb_fail:
570     return (IOMMUTLBEntry) {0};
571 }
572 
573 /* Called from RCU critical section */
flatview_translate(FlatView * fv,hwaddr addr,hwaddr * xlat,hwaddr * plen,bool is_write,MemTxAttrs attrs)574 MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
575                                  hwaddr *plen, bool is_write,
576                                  MemTxAttrs attrs)
577 {
578     MemoryRegion *mr;
579     MemoryRegionSection section;
580     AddressSpace *as = NULL;
581 
582     /* This can be MMIO, so setup MMIO bit. */
583     section = flatview_do_translate(fv, addr, xlat, plen, NULL,
584                                     is_write, true, &as, attrs);
585     mr = section.mr;
586 
587     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
588         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
589         *plen = MIN(page, *plen);
590     }
591 
592     return mr;
593 }
594 
/*
 * Per-CPU record of an IOMMU notifier registered on behalf of TCG, one
 * per (IOMMU region, IOMMU index) pair the CPU has translated through.
 */
typedef struct TCGIOMMUNotifier {
    /* Notifier handed to the IOMMU; container_of() anchor in the callback */
    IOMMUNotifier n;
    /* Memory region of the IOMMU the notifier is registered on */
    MemoryRegion *mr;
    /* CPU whose TLB is flushed when the IOMMU reports an unmap */
    CPUState *cpu;
    /* IOMMU index this notifier was registered for */
    int iommu_idx;
    /* Set when the CPU uses the IOMMU; cleared once its TLB was flushed */
    bool active;
} TCGIOMMUNotifier;
602 
/*
 * IOMMU unmap callback: flush the TLB of the CPU that owns notifier @n,
 * unless the notifier is already inactive.  @iotlb is not examined; the
 * whole TLB is flushed.
 */
static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);

    if (notifier->active) {
        tlb_flush(notifier->cpu);
        notifier->active = false;
    }
    /* We leave the notifier struct on the list to avoid reallocating it later.
     * Generally the number of IOMMUs a CPU deals with will be small.
     * In any case we can't unregister the iommu notifier from a notify
     * callback.
     */
}
618 
/*
 * Make sure @cpu has an IOMMU notifier registered for the combination
 * of @iommu_mr and @iommu_idx, so that we can flush its TLB when the
 * IOMMU tells us the mappings we've cached have changed, and mark that
 * notifier active.
 *
 * Failure to register a new notifier is fatal: the error is reported
 * and the process exits.
 */
static void tcg_register_iommu_notifier(CPUState *cpu,
                                        IOMMUMemoryRegion *iommu_mr,
                                        int iommu_idx)
{
    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
    TCGIOMMUNotifier *notifier = NULL;
    Error *err = NULL;
    int i;

    /* Look for an existing notifier for this IOMMU/index pair. */
    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
        TCGIOMMUNotifier *candidate =
            g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);

        if (candidate->mr == mr && candidate->iommu_idx == iommu_idx) {
            notifier = candidate;
            break;
        }
    }

    if (notifier == NULL) {
        /* Not found, add a new entry at the end of the array */
        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
        notifier = g_new0(TCGIOMMUNotifier, 1);
        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;

        notifier->mr = mr;
        notifier->iommu_idx = iommu_idx;
        notifier->cpu = cpu;
        /* Rather than trying to register interest in the specific part
         * of the iommu's address space that we've accessed and then
         * expand it later as subsequent accesses touch more of it, we
         * just register interest in the whole thing, on the assumption
         * that iommu reconfiguration will be rare.
         */
        iommu_notifier_init(&notifier->n,
                            tcg_iommu_unmap_notify,
                            IOMMU_NOTIFIER_UNMAP,
                            0,
                            HWADDR_MAX,
                            iommu_idx);
        if (memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
                                                  &err)) {
            error_report_err(err);
            exit(1);
        }
    }

    if (!notifier->active) {
        notifier->active = true;
    }
}
671 
/*
 * Destroy the CPU's notifier list: unregister every notifier from its
 * IOMMU region, free it, then free the array itself.
 */
static void tcg_iommu_free_notifier_list(CPUState *cpu)
{
    int i = 0;

    while (i < cpu->iommu_notifiers->len) {
        TCGIOMMUNotifier *notifier =
            g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);

        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
        g_free(notifier);
        i++;
    }
    g_array_free(cpu->iommu_notifiers, true);
}
685 
686 /* Called from RCU critical section */
687 MemoryRegionSection *
address_space_translate_for_iotlb(CPUState * cpu,int asidx,hwaddr addr,hwaddr * xlat,hwaddr * plen,MemTxAttrs attrs,int * prot)688 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
689                                   hwaddr *xlat, hwaddr *plen,
690                                   MemTxAttrs attrs, int *prot)
691 {
692     MemoryRegionSection *section;
693     IOMMUMemoryRegion *iommu_mr;
694     IOMMUMemoryRegionClass *imrc;
695     IOMMUTLBEntry iotlb;
696     int iommu_idx;
697     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
698 
699     for (;;) {
700         section = address_space_translate_internal(d, addr, &addr, plen, false);
701 
702         iommu_mr = memory_region_get_iommu(section->mr);
703         if (!iommu_mr) {
704             break;
705         }
706 
707         imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
708 
709         iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
710         tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
711         /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
712          * doesn't short-cut its translation table walk.
713          */
714         iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
715         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
716                 | (addr & iotlb.addr_mask));
717         /* Update the caller's prot bits to remove permissions the IOMMU
718          * is giving us a failure response for. If we get down to no
719          * permissions left at all we can give up now.
720          */
721         if (!(iotlb.perm & IOMMU_RO)) {
722             *prot &= ~(PAGE_READ | PAGE_EXEC);
723         }
724         if (!(iotlb.perm & IOMMU_WO)) {
725             *prot &= ~PAGE_WRITE;
726         }
727 
728         if (!*prot) {
729             goto translate_fail;
730         }
731 
732         d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
733     }
734 
735     assert(!memory_region_is_iommu(section->mr));
736     *xlat = addr;
737     return section;
738 
739 translate_fail:
740     return &d->map.sections[PHYS_SECTION_UNASSIGNED];
741 }
742 #endif
743 
744 #if !defined(CONFIG_USER_ONLY)
745 
cpu_common_post_load(void * opaque,int version_id)746 static int cpu_common_post_load(void *opaque, int version_id)
747 {
748     CPUState *cpu = opaque;
749 
750     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
751        version_id is increased. */
752     cpu->interrupt_request &= ~0x01;
753     tlb_flush(cpu);
754 
755     /* loadvm has just updated the content of RAM, bypassing the
756      * usual mechanisms that ensure we flush TBs for writes to
757      * memory we've translated code from. So we must flush all TBs,
758      * which will now be stale.
759      */
760     tb_flush(cpu);
761 
762     return 0;
763 }
764 
cpu_common_pre_load(void * opaque)765 static int cpu_common_pre_load(void *opaque)
766 {
767     CPUState *cpu = opaque;
768 
769     cpu->exception_index = -1;
770 
771     return 0;
772 }
773 
cpu_common_exception_index_needed(void * opaque)774 static bool cpu_common_exception_index_needed(void *opaque)
775 {
776     CPUState *cpu = opaque;
777 
778     return tcg_enabled() && cpu->exception_index != -1;
779 }
780 
/*
 * Subsection of vmstate_cpu_common carrying CPUState.exception_index;
 * cpu_common_exception_index_needed() decides whether it is needed.
 */
static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_exception_index_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
791 
cpu_common_crash_occurred_needed(void * opaque)792 static bool cpu_common_crash_occurred_needed(void *opaque)
793 {
794     CPUState *cpu = opaque;
795 
796     return cpu->crash_occurred;
797 }
798 
/*
 * Subsection of vmstate_cpu_common carrying CPUState.crash_occurred;
 * cpu_common_crash_occurred_needed() decides whether it is needed.
 */
static const VMStateDescription vmstate_cpu_common_crash_occurred = {
    .name = "cpu_common/crash_occurred",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = cpu_common_crash_occurred_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(crash_occurred, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
809 
/*
 * Migration description of the target-independent CPUState fields:
 * halted and interrupt_request, plus two optional subsections
 * (exception_index and crash_occurred).
 */
const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    /* Resets exception_index before loading */
    .pre_load = cpu_common_pre_load,
    /* Clears a legacy interrupt bit and flushes the TLB and TBs */
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    /* NULL-terminated list of subsections, each with a .needed callback */
    .subsections = (const VMStateDescription*[]) {
        &vmstate_cpu_common_exception_index,
        &vmstate_cpu_common_crash_occurred,
        NULL
    }
};
827 
828 #endif
829 
qemu_get_cpu(int index)830 CPUState *qemu_get_cpu(int index)
831 {
832     CPUState *cpu;
833 
834     CPU_FOREACH(cpu) {
835         if (cpu->cpu_index == index) {
836             return cpu;
837         }
838     }
839 
840     return NULL;
841 }
842 
843 #if !defined(CONFIG_USER_ONLY)
/*
 * Create address space number @asidx of @cpu on top of memory region
 * @mr.
 *
 * The new AddressSpace is named "<prefix>-<cpu_index>".  The CPU's
 * array of address spaces is allocated on first use, sized by
 * cpu->num_ases, which the caller must have set beforehand.  When TCG
 * is enabled a memory listener is registered on the new address space.
 */
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    AddressSpace *as = g_new0(AddressSpace, 1);
    CPUAddressSpace *slot;
    char *name;

    assert(mr);
    name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    address_space_init(as, mr, name);
    g_free(name);

    /* Target code should have set num_ases before calling us */
    assert(asidx < cpu->num_ases);

    /* KVM cannot currently support multiple address spaces. */
    assert(asidx == 0 || !kvm_enabled());

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    if (cpu->cpu_ases == NULL) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
    }

    slot = &cpu->cpu_ases[asidx];
    slot->cpu = cpu;
    slot->as = as;

    if (!tcg_enabled()) {
        return;
    }
    slot->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
    slot->tcg_as_listener.commit = tcg_commit;
    memory_listener_register(&slot->tcg_as_listener, as);
}
880 
cpu_get_address_space(CPUState * cpu,int asidx)881 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
882 {
883     /* Return the AddressSpace corresponding to the specified index */
884     return cpu->cpu_ases[asidx].as;
885 }
886 #endif
887 
/* Undo cpu_exec_realizefn(): take @cpu off the CPU list, drop its vmstate
 * registrations and, for system emulation, free its IOMMU notifier list.
 */
void cpu_exec_unrealizefn(CPUState *cpu)
{
    CPUClass *klass = CPU_GET_CLASS(cpu);

    cpu_list_remove(cpu);

    /* class-specific state first, then the common description */
    if (klass->vmsd) {
        vmstate_unregister(NULL, klass->vmsd, cpu);
    }
    if (!qdev_get_vmsd(DEVICE(cpu))) {
        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
    }
#ifndef CONFIG_USER_ONLY
    tcg_iommu_free_notifier_list(cpu);
#endif
}
904 
905 Property cpu_common_props[] = {
906 #ifndef CONFIG_USER_ONLY
907     /* Create a memory property for softmmu CPU object,
908      * so users can wire up its memory. (This can't go in hw/core/cpu.c
909      * because that file is compiled only once for both user-mode
910      * and system builds.) The default if no link is set up is to use
911      * the system address space.
912      */
913     DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
914                      MemoryRegion *),
915 #endif
916     DEFINE_PROP_END_OF_LIST(),
917 };
918 
/* Instance-init hook: start with no address spaces; for system emulation
 * also record the creating thread's id and default the CPU's memory to
 * system_memory (taking a reference on it).
 */
void cpu_exec_initfn(CPUState *cpu)
{
    cpu->num_ases = 0;
    cpu->as = NULL;

#ifndef CONFIG_USER_ONLY
    cpu->memory = system_memory;
    object_ref(OBJECT(cpu->memory));
    cpu->thread_id = qemu_get_thread_id();
#endif
}
930 
cpu_exec_realizefn(CPUState * cpu,Error ** errp)931 void cpu_exec_realizefn(CPUState *cpu, Error **errp)
932 {
933     CPUClass *cc = CPU_GET_CLASS(cpu);
934     static bool tcg_target_initialized;
935 
936     cpu_list_add(cpu);
937 
938     if (tcg_enabled() && !tcg_target_initialized) {
939         tcg_target_initialized = true;
940         cc->tcg_initialize();
941     }
942     tlb_init(cpu);
943 
944     qemu_plugin_vcpu_init_hook(cpu);
945 
946 #ifndef CONFIG_USER_ONLY
947     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
948         vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
949     }
950     if (cc->vmsd != NULL) {
951         vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
952     }
953 
954     cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
955 #endif
956 }
957 
parse_cpu_option(const char * cpu_option)958 const char *parse_cpu_option(const char *cpu_option)
959 {
960     ObjectClass *oc;
961     CPUClass *cc;
962     gchar **model_pieces;
963     const char *cpu_type;
964 
965     model_pieces = g_strsplit(cpu_option, ",", 2);
966     if (!model_pieces[0]) {
967         error_report("-cpu option cannot be empty");
968         exit(1);
969     }
970 
971     oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
972     if (oc == NULL) {
973         error_report("unable to find CPU model '%s'", model_pieces[0]);
974         g_strfreev(model_pieces);
975         exit(EXIT_FAILURE);
976     }
977 
978     cpu_type = object_class_get_name(oc);
979     cc = CPU_CLASS(oc);
980     cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
981     g_strfreev(model_pieces);
982     return cpu_type;
983 }
984 
985 #if defined(CONFIG_USER_ONLY)
/* User-mode variant: invalidate the one-byte range starting at @addr,
 * holding the mmap lock around the call.
 */
void tb_invalidate_phys_addr(target_ulong addr)
{
    const target_ulong range_end = addr + 1;

    mmap_lock();
    tb_invalidate_phys_page_range(addr, range_end);
    mmap_unlock();
}
992 
/* User-mode variant: @pc is handed straight to tb_invalidate_phys_addr();
 * @cpu is not used.
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    target_ulong where = pc;

    tb_invalidate_phys_addr(where);
}
997 #else
/* System-emulation variant: translate @addr in @as and, if it lands in RAM
 * or a ROM device, invalidate the one-byte range at the resulting ram
 * address.  Does nothing when TCG is not enabled.
 */
void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
{
    hwaddr xlat_len = 1;
    MemoryRegion *region;
    ram_addr_t target;

    if (!tcg_enabled()) {
        return;
    }

    RCU_READ_LOCK_GUARD();
    /* on return, addr holds the offset within the translated region */
    region = address_space_translate(as, addr, &addr, &xlat_len, false, attrs);
    if (!memory_region_is_ram(region) && !memory_region_is_romd(region)) {
        return;
    }

    target = memory_region_get_ram_addr(region) + addr;
    tb_invalidate_phys_page_range(target, target + 1);
}
1017 
/* System-emulation variant: invalidate translated code at guest virtual
 * address @pc.
 *
 * The debug page lookup yields the physical page and the transaction
 * attributes.  Nothing is done when the lookup fails (returns -1).
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    MemTxAttrs attrs;
    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);

    if (phys != -1) {
        /*
         * Derive the address-space index only after a successful lookup:
         * NOTE(review): on failure attrs presumably carries no meaningful
         * result, so it should not be fed to cpu_asidx_from_attrs().
         */
        int asidx = cpu_asidx_from_attrs(cpu, attrs);

        /* Locks grabbed by tb_invalidate_phys_addr */
        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
    }
}
1029 #endif
1030 
1031 #ifndef CONFIG_USER_ONLY
1032 /* Add a watchpoint.  */
cpu_watchpoint_insert(CPUState * cpu,vaddr addr,vaddr len,int flags,CPUWatchpoint ** watchpoint)1033 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1034                           int flags, CPUWatchpoint **watchpoint)
1035 {
1036     CPUWatchpoint *wp;
1037 
1038     /* forbid ranges which are empty or run off the end of the address space */
1039     if (len == 0 || (addr + len - 1) < addr) {
1040         error_report("tried to set invalid watchpoint at %"
1041                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1042         return -EINVAL;
1043     }
1044     wp = g_malloc(sizeof(*wp));
1045 
1046     wp->vaddr = addr;
1047     wp->len = len;
1048     wp->flags = flags;
1049 
1050     /* keep all GDB-injected watchpoints in front */
1051     if (flags & BP_GDB) {
1052         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
1053     } else {
1054         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
1055     }
1056 
1057     tlb_flush_page(cpu, addr);
1058 
1059     if (watchpoint)
1060         *watchpoint = wp;
1061     return 0;
1062 }
1063 
1064 /* Remove a specific watchpoint.  */
cpu_watchpoint_remove(CPUState * cpu,vaddr addr,vaddr len,int flags)1065 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1066                           int flags)
1067 {
1068     CPUWatchpoint *wp;
1069 
1070     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1071         if (addr == wp->vaddr && len == wp->len
1072                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1073             cpu_watchpoint_remove_by_ref(cpu, wp);
1074             return 0;
1075         }
1076     }
1077     return -ENOENT;
1078 }
1079 
1080 /* Remove a specific watchpoint by reference.  */
cpu_watchpoint_remove_by_ref(CPUState * cpu,CPUWatchpoint * watchpoint)1081 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1082 {
1083     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1084 
1085     tlb_flush_page(cpu, watchpoint->vaddr);
1086 
1087     g_free(watchpoint);
1088 }
1089 
1090 /* Remove all matching watchpoints.  */
cpu_watchpoint_remove_all(CPUState * cpu,int mask)1091 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1092 {
1093     CPUWatchpoint *wp, *next;
1094 
1095     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1096         if (wp->flags & mask) {
1097             cpu_watchpoint_remove_by_ref(cpu, wp);
1098         }
1099     }
1100 }
1101 
1102 /* Return true if this watchpoint address matches the specified
1103  * access (ie the address range covered by the watchpoint overlaps
1104  * partially or completely with the address range covered by the
1105  * access).
1106  */
watchpoint_address_matches(CPUWatchpoint * wp,vaddr addr,vaddr len)1107 static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
1108                                               vaddr addr, vaddr len)
1109 {
1110     /* We know the lengths are non-zero, but a little caution is
1111      * required to avoid errors in the case where the range ends
1112      * exactly at the top of the address space and so addr + len
1113      * wraps round to zero.
1114      */
1115     vaddr wpend = wp->vaddr + wp->len - 1;
1116     vaddr addrend = addr + len - 1;
1117 
1118     return !(addr > wpend || wp->vaddr > addrend);
1119 }
1120 
1121 /* Return flags for watchpoints that match addr + prot.  */
cpu_watchpoint_address_matches(CPUState * cpu,vaddr addr,vaddr len)1122 int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
1123 {
1124     CPUWatchpoint *wp;
1125     int ret = 0;
1126 
1127     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1128         if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
1129             ret |= wp->flags;
1130         }
1131     }
1132     return ret;
1133 }
1134 #endif /* !CONFIG_USER_ONLY */
1135 
1136 /* Add a breakpoint.  */
cpu_breakpoint_insert(CPUState * cpu,vaddr pc,int flags,CPUBreakpoint ** breakpoint)1137 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1138                           CPUBreakpoint **breakpoint)
1139 {
1140     CPUBreakpoint *bp;
1141 
1142     bp = g_malloc(sizeof(*bp));
1143 
1144     bp->pc = pc;
1145     bp->flags = flags;
1146 
1147     /* keep all GDB-injected breakpoints in front */
1148     if (flags & BP_GDB) {
1149         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1150     } else {
1151         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1152     }
1153 
1154     breakpoint_invalidate(cpu, pc);
1155 
1156     if (breakpoint) {
1157         *breakpoint = bp;
1158     }
1159     return 0;
1160 }
1161 
1162 /* Remove a specific breakpoint.  */
cpu_breakpoint_remove(CPUState * cpu,vaddr pc,int flags)1163 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1164 {
1165     CPUBreakpoint *bp;
1166 
1167     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1168         if (bp->pc == pc && bp->flags == flags) {
1169             cpu_breakpoint_remove_by_ref(cpu, bp);
1170             return 0;
1171         }
1172     }
1173     return -ENOENT;
1174 }
1175 
1176 /* Remove a specific breakpoint by reference.  */
cpu_breakpoint_remove_by_ref(CPUState * cpu,CPUBreakpoint * breakpoint)1177 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1178 {
1179     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1180 
1181     breakpoint_invalidate(cpu, breakpoint->pc);
1182 
1183     g_free(breakpoint);
1184 }
1185 
1186 /* Remove all matching breakpoints. */
cpu_breakpoint_remove_all(CPUState * cpu,int mask)1187 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1188 {
1189     CPUBreakpoint *bp, *next;
1190 
1191     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1192         if (bp->flags & mask) {
1193             cpu_breakpoint_remove_by_ref(cpu, bp);
1194         }
1195     }
1196 }
1197 
1198 /* enable or disable single step mode. EXCP_DEBUG is returned by the
1199    CPU loop after each instruction */
cpu_single_step(CPUState * cpu,int enabled)1200 void cpu_single_step(CPUState *cpu, int enabled)
1201 {
1202     if (cpu->singlestep_enabled != enabled) {
1203         cpu->singlestep_enabled = enabled;
1204         if (kvm_enabled()) {
1205             kvm_update_guest_debug(cpu, 0);
1206         } else {
1207             /* must flush all the translated code to avoid inconsistencies */
1208             /* XXX: only flush what is necessary */
1209             tb_flush(cpu);
1210         }
1211     }
1212 }
1213 
/* Report a fatal error for @cpu and abort the process.
 *
 * The printf-style message and a CPU state dump go to stderr, and again to
 * the QEMU log when that is a separate destination.  Replay is finished
 * and, in user-mode builds, the SIGABRT disposition is reset to the
 * default before abort() is called.  Does not return.
 */
void cpu_abort(CPUState *cpu, const char *fmt, ...)
{
    const int dump_flags = CPU_DUMP_FPU | CPU_DUMP_CCOP;
    va_list console_args;
    va_list log_args;

    va_start(console_args, fmt);
    /* a va_list can be consumed only once; keep a copy for the log */
    va_copy(log_args, console_args);

    fputs("qemu: fatal: ", stderr);
    vfprintf(stderr, fmt, console_args);
    fputc('\n', stderr);
    cpu_dump_state(cpu, stderr, dump_flags);

    if (qemu_log_separate()) {
        qemu_log_lock();
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, log_args);
        qemu_log("\n");
        log_cpu_state(cpu, dump_flags);
        qemu_log_flush();
        qemu_log_unlock();
        qemu_log_close();
    }

    va_end(log_args);
    va_end(console_args);
    replay_finish();

#if defined(CONFIG_USER_ONLY)
    {
        struct sigaction dfl;

        dfl.sa_handler = SIG_DFL;
        dfl.sa_flags = 0;
        sigfillset(&dfl.sa_mask);
        sigaction(SIGABRT, &dfl, NULL);
    }
#endif
    abort();
}
1249 
1250 #if !defined(CONFIG_USER_ONLY)
1251 /* Called from RCU critical section */
qemu_get_ram_block(ram_addr_t addr)1252 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1253 {
1254     RAMBlock *block;
1255 
1256     block = atomic_rcu_read(&ram_list.mru_block);
1257     if (block && addr - block->offset < block->max_length) {
1258         return block;
1259     }
1260     RAMBLOCK_FOREACH(block) {
1261         if (addr - block->offset < block->max_length) {
1262             goto found;
1263         }
1264     }
1265 
1266     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1267     abort();
1268 
1269 found:
1270     /* It is safe to write mru_block outside the iothread lock.  This
1271      * is what happens:
1272      *
1273      *     mru_block = xxx
1274      *     rcu_read_unlock()
1275      *                                        xxx removed from list
1276      *                  rcu_read_lock()
1277      *                  read mru_block
1278      *                                        mru_block = NULL;
1279      *                                        call_rcu(reclaim_ramblock, xxx);
1280      *                  rcu_read_unlock()
1281      *
1282      * atomic_rcu_set is not needed here.  The block was already published
1283      * when it was placed into the list.  Here we're just making an extra
1284      * copy of the pointer.
1285      */
1286     ram_list.mru_block = block;
1287     return block;
1288 }
1289 
/* Call tlb_reset_dirty() on every CPU for the given ram range.
 *
 * The page-aligned range must lie within a single RAMBlock (asserted).
 * Each CPU is passed the host address of the page containing @start
 * together with the original, unaligned @length.  TCG must be enabled.
 */
static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    ram_addr_t page_start, page_end;
    ram_addr_t host_start;
    RAMBlock *block;
    CPUState *cpu;

    assert(tcg_enabled());
    page_end = TARGET_PAGE_ALIGN(start + length);
    page_start = start & TARGET_PAGE_MASK;

    RCU_READ_LOCK_GUARD();
    block = qemu_get_ram_block(page_start);
    assert(block == qemu_get_ram_block(page_end - 1));
    host_start = (uintptr_t)ramblock_ptr(block, page_start - block->offset);

    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, host_start, length);
    }
}
1309 
1310 /* Note: start and end must be within the same ram block.  */
cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,ram_addr_t length,unsigned client)1311 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1312                                               ram_addr_t length,
1313                                               unsigned client)
1314 {
1315     DirtyMemoryBlocks *blocks;
1316     unsigned long end, page;
1317     bool dirty = false;
1318     RAMBlock *ramblock;
1319     uint64_t mr_offset, mr_size;
1320 
1321     if (length == 0) {
1322         return false;
1323     }
1324 
1325     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1326     page = start >> TARGET_PAGE_BITS;
1327 
1328     WITH_RCU_READ_LOCK_GUARD() {
1329         blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1330         ramblock = qemu_get_ram_block(start);
1331         /* Range sanity check on the ramblock */
1332         assert(start >= ramblock->offset &&
1333                start + length <= ramblock->offset + ramblock->used_length);
1334 
1335         while (page < end) {
1336             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1337             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1338             unsigned long num = MIN(end - page,
1339                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
1340 
1341             dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1342                                                   offset, num);
1343             page += num;
1344         }
1345 
1346         mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
1347         mr_size = (end - page) << TARGET_PAGE_BITS;
1348         memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
1349     }
1350 
1351     if (dirty && tcg_enabled()) {
1352         tlb_reset_dirty_range_all(start, length);
1353     }
1354 
1355     return dirty;
1356 }
1357 
cpu_physical_memory_snapshot_and_clear_dirty(MemoryRegion * mr,hwaddr offset,hwaddr length,unsigned client)1358 DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1359     (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
1360 {
1361     DirtyMemoryBlocks *blocks;
1362     ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
1363     unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1364     ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1365     ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1366     DirtyBitmapSnapshot *snap;
1367     unsigned long page, end, dest;
1368 
1369     snap = g_malloc0(sizeof(*snap) +
1370                      ((last - first) >> (TARGET_PAGE_BITS + 3)));
1371     snap->start = first;
1372     snap->end   = last;
1373 
1374     page = first >> TARGET_PAGE_BITS;
1375     end  = last  >> TARGET_PAGE_BITS;
1376     dest = 0;
1377 
1378     WITH_RCU_READ_LOCK_GUARD() {
1379         blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1380 
1381         while (page < end) {
1382             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1383             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1384             unsigned long num = MIN(end - page,
1385                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
1386 
1387             assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1388             assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1389             offset >>= BITS_PER_LEVEL;
1390 
1391             bitmap_copy_and_clear_atomic(snap->dirty + dest,
1392                                          blocks->blocks[idx] + offset,
1393                                          num);
1394             page += num;
1395             dest += num >> BITS_PER_LEVEL;
1396         }
1397     }
1398 
1399     if (tcg_enabled()) {
1400         tlb_reset_dirty_range_all(start, length);
1401     }
1402 
1403     memory_region_clear_dirty_bitmap(mr, offset, length);
1404 
1405     return snap;
1406 }
1407 
/* Return true if any page of [start, start + length) is marked dirty in
 * @snap.  The range must lie inside the snapshot (asserted).
 */
bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
                                            ram_addr_t start,
                                            ram_addr_t length)
{
    unsigned long bit, limit;

    assert(start >= snap->start);
    assert(start + length <= snap->end);

    /* bit positions are relative to the start of the snapshot */
    limit = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;

    for (bit = (start - snap->start) >> TARGET_PAGE_BITS; bit < limit; bit++) {
        if (test_bit(bit, snap->dirty)) {
            return true;
        }
    }
    return false;
}
1428 
1429 /* Called from RCU critical section */
memory_region_section_get_iotlb(CPUState * cpu,MemoryRegionSection * section)1430 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1431                                        MemoryRegionSection *section)
1432 {
1433     AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
1434     return section - d->map.sections;
1435 }
#endif /* !defined(CONFIG_USER_ONLY) */
1437 
1438 #if !defined(CONFIG_USER_ONLY)
1439 
1440 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
1441                             uint16_t section);
1442 static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1443 
1444 static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
1445                                qemu_anon_ram_alloc;
1446 
1447 /*
1448  * Set a custom physical guest memory alloator.
1449  * Accelerators with unusual needs may need this.  Hopefully, we can
1450  * get rid of it eventually.
1451  */
phys_mem_set_alloc(void * (* alloc)(size_t,uint64_t * align,bool shared))1452 void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
1453 {
1454     phys_mem_alloc = alloc;
1455 }
1456 
/* Append a copy of @section to @map, growing the array if needed, and
 * take a reference on the section's memory region.
 *
 * Returns the index of the new entry.
 */
static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
{
    uint16_t index;

    /* The physical section number is ORed with a page-aligned
     * pointer to produce the iotlb entries.  Thus it should
     * never overflow into the page-aligned value.
     */
    assert(map->sections_nb < TARGET_PAGE_SIZE);

    /* array full: double its capacity, starting from at least 16 */
    if (map->sections_nb_alloc == map->sections_nb) {
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
    }

    map->sections[map->sections_nb] = *section;
    memory_region_ref(section->mr);

    index = map->sections_nb;
    map->sections_nb++;
    return index;
}
1475 
/* Drop the section's reference on @mr.  If @mr is the iomem region of a
 * subpage, also unref that object and free the enclosing subpage_t.
 */
static void phys_section_destroy(MemoryRegion *mr)
{
    /* sample the flag before the unref below */
    const bool is_subpage = mr->subpage;
    subpage_t *owner;

    memory_region_unref(mr);

    if (!is_subpage) {
        return;
    }

    owner = container_of(mr, subpage_t, iomem);
    object_unref(OBJECT(&owner->iomem));
    g_free(owner);
}
1488 
/* Destroy every section of @map, last to first, then free the section and
 * node arrays.
 */
static void phys_sections_free(PhysPageMap *map)
{
    while (map->sections_nb != 0) {
        map->sections_nb--;
        phys_section_destroy(map->sections[map->sections_nb].mr);
    }

    g_free(map->sections);
    g_free(map->nodes);
}
1498 
/* Register @section, which covers only part of a target page, in @fv's
 * dispatch map.
 *
 * The page must currently map either to an existing subpage container or
 * to io_mem_unassigned (asserted).  In the latter case a new subpage
 * container is created and installed for the whole page first.  The
 * section is then registered inside the container for its in-page byte
 * range.
 */
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    hwaddr page_base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *current = phys_page_find(d, page_base);
    MemoryRegionSection container = {
        .offset_within_address_space = page_base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    subpage_t *subpage;
    hwaddr first, last;
    uint16_t index;

    assert(current->mr->subpage || current->mr == &io_mem_unassigned);

    if (current->mr->subpage) {
        subpage = container_of(current->mr, subpage_t, iomem);
    } else {
        subpage = subpage_init(fv, page_base);
        container.fv = fv;
        container.mr = &subpage->iomem;
        index = phys_section_add(&d->map, &container);
        phys_page_set(d, page_base >> TARGET_PAGE_BITS, 1, index);
    }

    /* inclusive byte range of the section within its page */
    first = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    last = first + int128_get64(section->size) - 1;

    index = phys_section_add(&d->map, section);
    subpage_register(subpage, first, last, index);
}
1528 
1529 
/* Register @section, covering one or more whole target pages, in @fv's
 * dispatch map: the section is added once and every page of the range is
 * pointed at it.  The section must span at least one page (asserted).
 */
static void register_multipage(FlatView *fv,
                               MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    hwaddr first_page = section->offset_within_address_space
                        >> TARGET_PAGE_BITS;
    uint16_t index = phys_section_add(&d->map, section);
    Int128 pages = int128_rshift(section->size, TARGET_PAGE_BITS);
    uint64_t page_count = int128_get64(pages);

    assert(page_count);
    phys_page_set(d, first_page, page_count, index);
}
1542 
1543 /*
1544  * The range in *section* may look like this:
1545  *
1546  *      |s|PPPPPPP|s|
1547  *
1548  * where s stands for subpage and P for page.
1549  */
flatview_add_to_dispatch(FlatView * fv,MemoryRegionSection * section)1550 void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1551 {
1552     MemoryRegionSection remain = *section;
1553     Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1554 
1555     /* register first subpage */
1556     if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1557         uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
1558                         - remain.offset_within_address_space;
1559 
1560         MemoryRegionSection now = remain;
1561         now.size = int128_min(int128_make64(left), now.size);
1562         register_subpage(fv, &now);
1563         if (int128_eq(remain.size, now.size)) {
1564             return;
1565         }
1566         remain.size = int128_sub(remain.size, now.size);
1567         remain.offset_within_address_space += int128_get64(now.size);
1568         remain.offset_within_region += int128_get64(now.size);
1569     }
1570 
1571     /* register whole pages */
1572     if (int128_ge(remain.size, page_size)) {
1573         MemoryRegionSection now = remain;
1574         now.size = int128_and(now.size, int128_neg(page_size));
1575         register_multipage(fv, &now);
1576         if (int128_eq(remain.size, now.size)) {
1577             return;
1578         }
1579         remain.size = int128_sub(remain.size, now.size);
1580         remain.offset_within_address_space += int128_get64(now.size);
1581         remain.offset_within_region += int128_get64(now.size);
1582     }
1583 
1584     /* register last subpage */
1585     register_subpage(fv, &remain);
1586 }
1587 
/* Flush KVM's coalesced MMIO buffer; does nothing when KVM is not in use. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }
    kvm_flush_coalesced_mmio_buffer();
}
1593 
qemu_mutex_lock_ramlist(void)1594 void qemu_mutex_lock_ramlist(void)
1595 {
1596     qemu_mutex_lock(&ram_list.mutex);
1597 }
1598 
qemu_mutex_unlock_ramlist(void)1599 void qemu_mutex_unlock_ramlist(void)
1600 {
1601     qemu_mutex_unlock(&ram_list.mutex);
1602 }
1603 
/* Print a table of all RAM blocks to @mon: name, page size, offset, used
 * length and maximum length, under a header row.
 */
void ram_block_dump(Monitor *mon)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();
    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
                   "Block Name", "PSize", "Offset", "Used", "Total");

    RAMBLOCK_FOREACH(block) {
        /* size_to_str() returns an allocated string; freed below */
        char *page_size_text = size_to_str(block->page_size);

        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
                       " 0x%016" PRIx64 "\n", block->idstr, page_size_text,
                       (uint64_t)block->offset,
                       (uint64_t)block->used_length,
                       (uint64_t)block->max_length);
        g_free(page_size_text);
    }
}
1622 
1623 #ifdef __linux__
1624 /*
1625  * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1626  * may or may not name the same files / on the same filesystem now as
1627  * when we actually open and map them.  Iterate over the file
1628  * descriptors instead, and use qemu_fd_getpagesize().
1629  */
find_min_backend_pagesize(Object * obj,void * opaque)1630 static int find_min_backend_pagesize(Object *obj, void *opaque)
1631 {
1632     long *hpsize_min = opaque;
1633 
1634     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1635         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1636         long hpsize = host_memory_backend_pagesize(backend);
1637 
1638         if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1639             *hpsize_min = hpsize;
1640         }
1641     }
1642 
1643     return 0;
1644 }
1645 
/* object_child_foreach() callback: raise *opaque (a long) to the page size
 * of @obj when @obj is a mapped memory backend with a larger page size.
 * Always returns 0.
 */
static int find_max_backend_pagesize(Object *obj, void *opaque)
{
    long *largest = opaque;
    HostMemoryBackend *backend;
    long psize;

    if (!object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        return 0;
    }

    backend = MEMORY_BACKEND(obj);
    psize = host_memory_backend_pagesize(backend);
    if (host_memory_backend_is_mapped(backend) && psize > *largest) {
        *largest = psize;
    }

    return 0;
}
1661 
1662 /*
1663  * TODO: We assume right now that all mapped host memory backends are
1664  * used as RAM, however some might be used for different purposes.
1665  */
qemu_minrampagesize(void)1666 long qemu_minrampagesize(void)
1667 {
1668     long hpsize = LONG_MAX;
1669     long mainrampagesize;
1670     Object *memdev_root;
1671     MachineState *ms = MACHINE(qdev_get_machine());
1672 
1673     mainrampagesize = qemu_mempath_getpagesize(mem_path);
1674 
1675     /* it's possible we have memory-backend objects with
1676      * hugepage-backed RAM. these may get mapped into system
1677      * address space via -numa parameters or memory hotplug
1678      * hooks. we want to take these into account, but we
1679      * also want to make sure these supported hugepage
1680      * sizes are applicable across the entire range of memory
1681      * we may boot from, so we take the min across all
1682      * backends, and assume normal pages in cases where a
1683      * backend isn't backed by hugepages.
1684      */
1685     memdev_root = object_resolve_path("/objects", NULL);
1686     if (memdev_root) {
1687         object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
1688     }
1689     if (hpsize == LONG_MAX) {
1690         /* No additional memory regions found ==> Report main RAM page size */
1691         return mainrampagesize;
1692     }
1693 
1694     /* If NUMA is disabled or the NUMA nodes are not backed with a
1695      * memory-backend, then there is at least one node using "normal" RAM,
1696      * so if its page size is smaller we have got to report that size instead.
1697      */
1698     if (hpsize > mainrampagesize &&
1699         (ms->numa_state == NULL ||
1700          ms->numa_state->num_nodes == 0 ||
1701          ms->numa_state->nodes[0].node_memdev == NULL)) {
1702         static bool warned;
1703         if (!warned) {
1704             error_report("Huge page support disabled (n/a for main memory).");
1705             warned = true;
1706         }
1707         return mainrampagesize;
1708     }
1709 
1710     return hpsize;
1711 }
1712 
qemu_maxrampagesize(void)1713 long qemu_maxrampagesize(void)
1714 {
1715     long pagesize = qemu_mempath_getpagesize(mem_path);
1716     Object *memdev_root = object_resolve_path("/objects", NULL);
1717 
1718     if (memdev_root) {
1719         object_child_foreach(memdev_root, find_max_backend_pagesize,
1720                              &pagesize);
1721     }
1722     return pagesize;
1723 }
1724 #else
qemu_minrampagesize(void)1725 long qemu_minrampagesize(void)
1726 {
1727     return qemu_real_host_page_size;
1728 }
qemu_maxrampagesize(void)1729 long qemu_maxrampagesize(void)
1730 {
1731     return qemu_real_host_page_size;
1732 }
1733 #endif
1734 
1735 #ifdef CONFIG_POSIX
get_file_size(int fd)1736 static int64_t get_file_size(int fd)
1737 {
1738     int64_t size;
1739 #if defined(__linux__)
1740     struct stat st;
1741 
1742     if (fstat(fd, &st) < 0) {
1743         return -errno;
1744     }
1745 
1746     /* Special handling for devdax character devices */
1747     if (S_ISCHR(st.st_mode)) {
1748         g_autofree char *subsystem_path = NULL;
1749         g_autofree char *subsystem = NULL;
1750 
1751         subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
1752                                          major(st.st_rdev), minor(st.st_rdev));
1753         subsystem = g_file_read_link(subsystem_path, NULL);
1754 
1755         if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
1756             g_autofree char *size_path = NULL;
1757             g_autofree char *size_str = NULL;
1758 
1759             size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
1760                                     major(st.st_rdev), minor(st.st_rdev));
1761 
1762             if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
1763                 return g_ascii_strtoll(size_str, NULL, 0);
1764             }
1765         }
1766     }
1767 #endif /* defined(__linux__) */
1768 
1769     /* st.st_size may be zero for special files yet lseek(2) works */
1770     size = lseek(fd, 0, SEEK_END);
1771     if (size < 0) {
1772         return -errno;
1773     }
1774     return size;
1775 }
1776 
/*
 * Open the backing file for a RAM region.
 *
 * @path: an existing file (opened as is), a not yet existing file
 *        (created with mode 0644), or a directory (a temporary file is
 *        created inside it and unlinked right away)
 * @region_name: used to build the temporary file name in the directory
 *               case; '/' characters are replaced with '_'
 * @created: set to true only when @path itself was created here
 * @errp: set when the function fails
 *
 * Returns the open file descriptor, or -1 with @errp set on failure.
 */
static int file_ram_open(const char *path,
                         const char *region_name,
                         bool *created,
                         Error **errp)
{
    char *filename;
    char *sanitized_name;
    char *c;
    int fd = -1;
    int saved_errno;

    *created = false;
    for (;;) {
        fd = open(path, O_RDWR);
        if (fd >= 0) {
            /* @path names an existing file, use it */
            break;
        }
        /*
         * Capture errno right after every failing call: the cleanup
         * done below (g_free()) is not guaranteed to preserve it, and
         * the retry/abort decision must be based on the real failure.
         */
        saved_errno = errno;
        if (saved_errno == ENOENT) {
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
            saved_errno = errno;
        } else if (saved_errno == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            saved_errno = errno;
            g_free(filename);
        }
        if (saved_errno != EEXIST && saved_errno != EINTR) {
            error_setg_errno(errp, saved_errno,
                             "can't open backing store %s for guest RAM",
                             path);
            return -1;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}
1837 
/*
 * Map @memory bytes of the file @fd as the host memory of @block.
 *
 * Sets block->page_size from the file, validates block->mr->align and
 * raises it to at least the page size, rounds @memory up to the page
 * size, optionally resizes the file (@truncate), maps it and, when
 * mem_prealloc is set, preallocates the mapping.  On success block->fd
 * is set to @fd.
 *
 * Returns the mapped area, or NULL with @errp set on failure.
 */
static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            int fd,
                            bool truncate,
                            Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    void *area;

    block->page_size = qemu_fd_getpagesize(fd);
    if (block->mr->align % block->page_size) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be multiples of page size 0x%zx",
                   block->mr->align, block->page_size);
        return NULL;
    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be a power of two", block->mr->align);
        return NULL;
    }
    block->mr->align = MAX(block->page_size, block->mr->align);
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif

    if (memory < block->page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
        return NULL;
    }

    memory = ROUND_UP(memory, block->page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
     */
    if (truncate && ftruncate(fd, memory)) {
        perror("ftruncate");
    }

    area = qemu_ram_mmap(fd, memory, block->mr->align,
                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
        return NULL;
    }

    if (mem_prealloc) {
        /*
         * Collect the error locally: dereferencing @errp would miss a
         * failure whenever the caller passes NULL, and the mapping
         * would then be returned although preallocation failed.
         */
        Error *local_err = NULL;

        os_mem_prealloc(fd, area, memory, ms->smp.cpus, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            qemu_ram_munmap(fd, area, memory);
            return NULL;
        }
    }

    block->fd = fd;
    return area;
}
1911 #endif
1912 
1913 /* Allocate space within the ram_addr_t space that governs the
1914  * dirty bitmaps.
1915  * Called with the ramlist lock held.
1916  */
find_ram_offset(ram_addr_t size)1917 static ram_addr_t find_ram_offset(ram_addr_t size)
1918 {
1919     RAMBlock *block, *next_block;
1920     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1921 
1922     assert(size != 0); /* it would hand out same offset multiple times */
1923 
1924     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1925         return 0;
1926     }
1927 
1928     RAMBLOCK_FOREACH(block) {
1929         ram_addr_t candidate, next = RAM_ADDR_MAX;
1930 
1931         /* Align blocks to start on a 'long' in the bitmap
1932          * which makes the bitmap sync'ing take the fast path.
1933          */
1934         candidate = block->offset + block->max_length;
1935         candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1936 
1937         /* Search for the closest following block
1938          * and find the gap.
1939          */
1940         RAMBLOCK_FOREACH(next_block) {
1941             if (next_block->offset >= candidate) {
1942                 next = MIN(next, next_block->offset);
1943             }
1944         }
1945 
1946         /* If it fits remember our place and remember the size
1947          * of gap, but keep going so that we might find a smaller
1948          * gap to fill so avoiding fragmentation.
1949          */
1950         if (next - candidate >= size && next - candidate < mingap) {
1951             offset = candidate;
1952             mingap = next - candidate;
1953         }
1954 
1955         trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1956     }
1957 
1958     if (offset == RAM_ADDR_MAX) {
1959         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1960                 (uint64_t)size);
1961         abort();
1962     }
1963 
1964     trace_find_ram_offset(size, offset);
1965 
1966     return offset;
1967 }
1968 
last_ram_page(void)1969 static unsigned long last_ram_page(void)
1970 {
1971     RAMBlock *block;
1972     ram_addr_t last = 0;
1973 
1974     RCU_READ_LOCK_GUARD();
1975     RAMBLOCK_FOREACH(block) {
1976         last = MAX(last, block->offset + block->max_length);
1977     }
1978     return last >> TARGET_PAGE_BITS;
1979 }
1980 
/*
 * Apply QEMU_MADV_DONTDUMP to [@addr, @addr + @size) when
 * machine_dump_guest_core() says guest memory must stay out of core
 * dumps.  A failing madvise is only reported on stderr.
 */
static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    if (machine_dump_guest_core(current_machine)) {
        /* Guest memory is wanted in the core: nothing to do */
        return;
    }

    if (qemu_madvise(addr, size, QEMU_MADV_DONTDUMP) == 0) {
        return;
    }

    perror("qemu_madvise");
    fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                    "but dump_guest_core=off specified\n");
}
1995 
/* Return the identifier string stored in @rb (not a copy). */
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return &rb->idstr[0];
}
2000 
/* Return the host pointer recorded in @rb. */
void *qemu_ram_get_host_addr(RAMBlock *rb)
{
    void *host = rb->host;

    return host;
}
2005 
qemu_ram_get_offset(RAMBlock * rb)2006 ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
2007 {
2008     return rb->offset;
2009 }
2010 
qemu_ram_get_used_length(RAMBlock * rb)2011 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
2012 {
2013     return rb->used_length;
2014 }
2015 
/* Tell whether @rb has the RAM_SHARED flag set. */
bool qemu_ram_is_shared(RAMBlock *rb)
{
    return (rb->flags & RAM_SHARED) != 0;
}
2020 
2021 /* Note: Only set at the start of postcopy */
qemu_ram_is_uf_zeroable(RAMBlock * rb)2022 bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
2023 {
2024     return rb->flags & RAM_UF_ZEROPAGE;
2025 }
2026 
/* Set the RAM_UF_ZEROPAGE flag on @rb. */
void qemu_ram_set_uf_zeroable(RAMBlock *rb)
{
    rb->flags = rb->flags | RAM_UF_ZEROPAGE;
}
2031 
/* Tell whether @rb has the RAM_MIGRATABLE flag set. */
bool qemu_ram_is_migratable(RAMBlock *rb)
{
    return (rb->flags & RAM_MIGRATABLE) != 0;
}
2036 
/* Set the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_set_migratable(RAMBlock *rb)
{
    rb->flags = rb->flags | RAM_MIGRATABLE;
}
2041 
/* Clear the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_unset_migratable(RAMBlock *rb)
{
    rb->flags = rb->flags & ~RAM_MIGRATABLE;
}
2046 
2047 /* Called with iothread lock held.  */
qemu_ram_set_idstr(RAMBlock * new_block,const char * name,DeviceState * dev)2048 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2049 {
2050     RAMBlock *block;
2051 
2052     assert(new_block);
2053     assert(!new_block->idstr[0]);
2054 
2055     if (dev) {
2056         char *id = qdev_get_dev_path(dev);
2057         if (id) {
2058             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2059             g_free(id);
2060         }
2061     }
2062     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2063 
2064     RCU_READ_LOCK_GUARD();
2065     RAMBLOCK_FOREACH(block) {
2066         if (block != new_block &&
2067             !strcmp(block->idstr, new_block->idstr)) {
2068             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2069                     new_block->idstr);
2070             abort();
2071         }
2072     }
2073 }
2074 
2075 /* Called with iothread lock held.  */
qemu_ram_unset_idstr(RAMBlock * block)2076 void qemu_ram_unset_idstr(RAMBlock *block)
2077 {
2078     /* FIXME: arch_init.c assumes that this is not called throughout
2079      * migration.  Ignore the problem since hot-unplug during migration
2080      * does not work anyway.
2081      */
2082     if (block) {
2083         memset(block->idstr, 0, sizeof(block->idstr));
2084     }
2085 }
2086 
/* Return the page_size field of @rb. */
size_t qemu_ram_pagesize(RAMBlock *rb)
{
    size_t page_size = rb->page_size;

    return page_size;
}
2091 
2092 /* Returns the largest size of page in use */
qemu_ram_pagesize_largest(void)2093 size_t qemu_ram_pagesize_largest(void)
2094 {
2095     RAMBlock *block;
2096     size_t largest = 0;
2097 
2098     RAMBLOCK_FOREACH(block) {
2099         largest = MAX(largest, qemu_ram_pagesize(block));
2100     }
2101 
2102     return largest;
2103 }
2104 
memory_try_enable_merging(void * addr,size_t len)2105 static int memory_try_enable_merging(void *addr, size_t len)
2106 {
2107     if (!machine_mem_merge(current_machine)) {
2108         /* disabled by the user */
2109         return 0;
2110     }
2111 
2112     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2113 }
2114 
2115 /* Only legal before guest might have detected the memory size: e.g. on
2116  * incoming migration, or right after reset.
2117  *
2118  * As memory core doesn't know how is memory accessed, it is up to
2119  * resize callback to update device state and/or add assertions to detect
2120  * misuse, if necessary.
2121  */
qemu_ram_resize(RAMBlock * block,ram_addr_t newsize,Error ** errp)2122 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2123 {
2124     assert(block);
2125 
2126     newsize = HOST_PAGE_ALIGN(newsize);
2127 
2128     if (block->used_length == newsize) {
2129         return 0;
2130     }
2131 
2132     if (!(block->flags & RAM_RESIZEABLE)) {
2133         error_setg_errno(errp, EINVAL,
2134                          "Length mismatch: %s: 0x" RAM_ADDR_FMT
2135                          " in != 0x" RAM_ADDR_FMT, block->idstr,
2136                          newsize, block->used_length);
2137         return -EINVAL;
2138     }
2139 
2140     if (block->max_length < newsize) {
2141         error_setg_errno(errp, EINVAL,
2142                          "Length too large: %s: 0x" RAM_ADDR_FMT
2143                          " > 0x" RAM_ADDR_FMT, block->idstr,
2144                          newsize, block->max_length);
2145         return -EINVAL;
2146     }
2147 
2148     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
2149     block->used_length = newsize;
2150     cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
2151                                         DIRTY_CLIENTS_ALL);
2152     memory_region_set_size(block->mr, newsize);
2153     if (block->resized) {
2154         block->resized(block->idstr, newsize, block->host);
2155     }
2156     return 0;
2157 }
2158 
2159 /* Called with ram_list.mutex held */
dirty_memory_extend(ram_addr_t old_ram_size,ram_addr_t new_ram_size)2160 static void dirty_memory_extend(ram_addr_t old_ram_size,
2161                                 ram_addr_t new_ram_size)
2162 {
2163     ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
2164                                              DIRTY_MEMORY_BLOCK_SIZE);
2165     ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
2166                                              DIRTY_MEMORY_BLOCK_SIZE);
2167     int i;
2168 
2169     /* Only need to extend if block count increased */
2170     if (new_num_blocks <= old_num_blocks) {
2171         return;
2172     }
2173 
2174     for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
2175         DirtyMemoryBlocks *old_blocks;
2176         DirtyMemoryBlocks *new_blocks;
2177         int j;
2178 
2179         old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
2180         new_blocks = g_malloc(sizeof(*new_blocks) +
2181                               sizeof(new_blocks->blocks[0]) * new_num_blocks);
2182 
2183         if (old_num_blocks) {
2184             memcpy(new_blocks->blocks, old_blocks->blocks,
2185                    old_num_blocks * sizeof(old_blocks->blocks[0]));
2186         }
2187 
2188         for (j = old_num_blocks; j < new_num_blocks; j++) {
2189             new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
2190         }
2191 
2192         atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
2193 
2194         if (old_blocks) {
2195             g_free_rcu(old_blocks, rcu);
2196         }
2197     }
2198 }
2199 
ram_block_add(RAMBlock * new_block,Error ** errp,bool shared)2200 static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
2201 {
2202     RAMBlock *block;
2203     RAMBlock *last_block = NULL;
2204     ram_addr_t old_ram_size, new_ram_size;
2205     Error *err = NULL;
2206 
2207     old_ram_size = last_ram_page();
2208 
2209     qemu_mutex_lock_ramlist();
2210     new_block->offset = find_ram_offset(new_block->max_length);
2211 
2212     if (!new_block->host) {
2213         if (xen_enabled()) {
2214             xen_ram_alloc(new_block->offset, new_block->max_length,
2215                           new_block->mr, &err);
2216             if (err) {
2217                 error_propagate(errp, err);
2218                 qemu_mutex_unlock_ramlist();
2219                 return;
2220             }
2221         } else {
2222             new_block->host = phys_mem_alloc(new_block->max_length,
2223                                              &new_block->mr->align, shared);
2224             if (!new_block->host) {
2225                 error_setg_errno(errp, errno,
2226                                  "cannot set up guest memory '%s'",
2227                                  memory_region_name(new_block->mr));
2228                 qemu_mutex_unlock_ramlist();
2229                 return;
2230             }
2231             memory_try_enable_merging(new_block->host, new_block->max_length);
2232         }
2233     }
2234 
2235     new_ram_size = MAX(old_ram_size,
2236               (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2237     if (new_ram_size > old_ram_size) {
2238         dirty_memory_extend(old_ram_size, new_ram_size);
2239     }
2240     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2241      * QLIST (which has an RCU-friendly variant) does not have insertion at
2242      * tail, so save the last element in last_block.
2243      */
2244     RAMBLOCK_FOREACH(block) {
2245         last_block = block;
2246         if (block->max_length < new_block->max_length) {
2247             break;
2248         }
2249     }
2250     if (block) {
2251         QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2252     } else if (last_block) {
2253         QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2254     } else { /* list is empty */
2255         QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2256     }
2257     ram_list.mru_block = NULL;
2258 
2259     /* Write list before version */
2260     smp_wmb();
2261     ram_list.version++;
2262     qemu_mutex_unlock_ramlist();
2263 
2264     cpu_physical_memory_set_dirty_range(new_block->offset,
2265                                         new_block->used_length,
2266                                         DIRTY_CLIENTS_ALL);
2267 
2268     if (new_block->host) {
2269         qemu_ram_setup_dump(new_block->host, new_block->max_length);
2270         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2271         /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
2272         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
2273         ram_block_notify_add(new_block->host, new_block->max_length);
2274     }
2275 }
2276 
2277 #ifdef CONFIG_POSIX
qemu_ram_alloc_from_fd(ram_addr_t size,MemoryRegion * mr,uint32_t ram_flags,int fd,Error ** errp)2278 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2279                                  uint32_t ram_flags, int fd,
2280                                  Error **errp)
2281 {
2282     RAMBlock *new_block;
2283     Error *local_err = NULL;
2284     int64_t file_size;
2285 
2286     /* Just support these ram flags by now. */
2287     assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
2288 
2289     if (xen_enabled()) {
2290         error_setg(errp, "-mem-path not supported with Xen");
2291         return NULL;
2292     }
2293 
2294     if (kvm_enabled() && !kvm_has_sync_mmu()) {
2295         error_setg(errp,
2296                    "host lacks kvm mmu notifiers, -mem-path unsupported");
2297         return NULL;
2298     }
2299 
2300     if (phys_mem_alloc != qemu_anon_ram_alloc) {
2301         /*
2302          * file_ram_alloc() needs to allocate just like
2303          * phys_mem_alloc, but we haven't bothered to provide
2304          * a hook there.
2305          */
2306         error_setg(errp,
2307                    "-mem-path not supported with this accelerator");
2308         return NULL;
2309     }
2310 
2311     size = HOST_PAGE_ALIGN(size);
2312     file_size = get_file_size(fd);
2313     if (file_size > 0 && file_size < size) {
2314         error_setg(errp, "backing store %s size 0x%" PRIx64
2315                    " does not match 'size' option 0x" RAM_ADDR_FMT,
2316                    mem_path, file_size, size);
2317         return NULL;
2318     }
2319 
2320     new_block = g_malloc0(sizeof(*new_block));
2321     new_block->mr = mr;
2322     new_block->used_length = size;
2323     new_block->max_length = size;
2324     new_block->flags = ram_flags;
2325     new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2326     if (!new_block->host) {
2327         g_free(new_block);
2328         return NULL;
2329     }
2330 
2331     ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
2332     if (local_err) {
2333         g_free(new_block);
2334         error_propagate(errp, local_err);
2335         return NULL;
2336     }
2337     return new_block;
2338 
2339 }
2340 
2341 
qemu_ram_alloc_from_file(ram_addr_t size,MemoryRegion * mr,uint32_t ram_flags,const char * mem_path,Error ** errp)2342 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2343                                    uint32_t ram_flags, const char *mem_path,
2344                                    Error **errp)
2345 {
2346     int fd;
2347     bool created;
2348     RAMBlock *block;
2349 
2350     fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2351     if (fd < 0) {
2352         return NULL;
2353     }
2354 
2355     block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
2356     if (!block) {
2357         if (created) {
2358             unlink(mem_path);
2359         }
2360         close(fd);
2361         return NULL;
2362     }
2363 
2364     return block;
2365 }
2366 #endif
2367 
2368 static
qemu_ram_alloc_internal(ram_addr_t size,ram_addr_t max_size,void (* resized)(const char *,uint64_t length,void * host),void * host,bool resizeable,bool share,MemoryRegion * mr,Error ** errp)2369 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2370                                   void (*resized)(const char*,
2371                                                   uint64_t length,
2372                                                   void *host),
2373                                   void *host, bool resizeable, bool share,
2374                                   MemoryRegion *mr, Error **errp)
2375 {
2376     RAMBlock *new_block;
2377     Error *local_err = NULL;
2378 
2379     size = HOST_PAGE_ALIGN(size);
2380     max_size = HOST_PAGE_ALIGN(max_size);
2381     new_block = g_malloc0(sizeof(*new_block));
2382     new_block->mr = mr;
2383     new_block->resized = resized;
2384     new_block->used_length = size;
2385     new_block->max_length = max_size;
2386     assert(max_size >= size);
2387     new_block->fd = -1;
2388     new_block->page_size = qemu_real_host_page_size;
2389     new_block->host = host;
2390     if (host) {
2391         new_block->flags |= RAM_PREALLOC;
2392     }
2393     if (resizeable) {
2394         new_block->flags |= RAM_RESIZEABLE;
2395     }
2396     ram_block_add(new_block, &local_err, share);
2397     if (local_err) {
2398         g_free(new_block);
2399         error_propagate(errp, local_err);
2400         return NULL;
2401     }
2402     return new_block;
2403 }
2404 
qemu_ram_alloc_from_ptr(ram_addr_t size,void * host,MemoryRegion * mr,Error ** errp)2405 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2406                                    MemoryRegion *mr, Error **errp)
2407 {
2408     return qemu_ram_alloc_internal(size, size, NULL, host, false,
2409                                    false, mr, errp);
2410 }
2411 
qemu_ram_alloc(ram_addr_t size,bool share,MemoryRegion * mr,Error ** errp)2412 RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
2413                          MemoryRegion *mr, Error **errp)
2414 {
2415     return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
2416                                    share, mr, errp);
2417 }
2418 
qemu_ram_alloc_resizeable(ram_addr_t size,ram_addr_t maxsz,void (* resized)(const char *,uint64_t length,void * host),MemoryRegion * mr,Error ** errp)2419 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2420                                      void (*resized)(const char*,
2421                                                      uint64_t length,
2422                                                      void *host),
2423                                      MemoryRegion *mr, Error **errp)
2424 {
2425     return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
2426                                    false, mr, errp);
2427 }
2428 
/*
 * Release the host memory of @block according to how it was obtained,
 * then free the RAMBlock itself.  Memory of a RAM_PREALLOC block is
 * left alone.  qemu_ram_free() schedules this through call_rcu().
 */
static void reclaim_ramblock(RAMBlock *block)
{
    if (!(block->flags & RAM_PREALLOC)) {
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
        } else if (block->fd >= 0) {
            /* File-backed mapping: unmap, then close the backing fd */
            qemu_ram_munmap(block->fd, block->host, block->max_length);
            close(block->fd);
#endif
        } else {
            qemu_anon_ram_free(block->host, block->max_length);
        }
    }
    g_free(block);
}
2445 
/*
 * Unlink @block from ram_list and schedule it for reclamation through
 * call_rcu().  A NULL @block is ignored.
 */
void qemu_ram_free(RAMBlock *block)
{
    if (!block) {
        return;
    }

    /* Notify about the removal before the block leaves the list */
    if (block->host) {
        ram_block_notify_remove(block->host, block->max_length);
    }

    qemu_mutex_lock_ramlist();
    QLIST_REMOVE_RCU(block, next);
    /* Reset the mru_block hint so it cannot point at the removed block */
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    /* reclaim_ramblock() releases the memory and the RAMBlock itself */
    call_rcu(block, reclaim_ramblock, rcu);
    qemu_mutex_unlock_ramlist();
}
2465 
2466 #ifndef _WIN32
/*
 * Re-create the host mapping of @length bytes at ram address @addr, in
 * place (MAP_FIXED), for every block that contains @addr.
 *
 * RAM_PREALLOC blocks are left untouched.  File-backed blocks are
 * remapped from their file, other blocks from anonymous memory.
 * Aborts under Xen; exits if the mapping cannot be re-established.
 */
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        ram_addr_t offset = addr - block->offset;
        void *vaddr, *area;
        int map_flags;

        if (offset >= block->max_length) {
            /* @addr is not inside this block */
            continue;
        }

        vaddr = ramblock_ptr(block, offset);
        if (block->flags & RAM_PREALLOC) {
            continue;
        }
        if (xen_enabled()) {
            abort();
        }

        if (block->fd >= 0) {
            map_flags = MAP_FIXED |
                        (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE);
            area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                        map_flags, block->fd, offset);
        } else {
            /*
             * Remap needs to match alloc.  Accelerators that
             * set phys_mem_alloc never remap.  If they did,
             * we'd need a remap hook here.
             */
            assert(phys_mem_alloc == qemu_anon_ram_alloc);

            map_flags = MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS;
            area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                        map_flags, -1, 0);
        }

        if (area != vaddr) {
            error_report("Could not remap addr: "
                         RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
                         length, addr);
            exit(1);
        }

        /* A fresh mapping has lost its madvise settings: apply them again */
        memory_try_enable_merging(vaddr, length);
        qemu_ram_setup_dump(vaddr, length);
    }
}
2513 #endif /* !_WIN32 */
2514 
2515 /* Return a host pointer to ram allocated with qemu_ram_alloc.
2516  * This should not be used for general purpose DMA.  Use address_space_map
2517  * or address_space_rw instead. For local memory (e.g. video ram) that the
2518  * device owns, use memory_region_get_ram_ptr.
2519  *
2520  * Called within RCU critical section.
2521  */
qemu_map_ram_ptr(RAMBlock * ram_block,ram_addr_t addr)2522 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2523 {
2524     RAMBlock *block = ram_block;
2525 
2526     if (block == NULL) {
2527         block = qemu_get_ram_block(addr);
2528         addr -= block->offset;
2529     }
2530 
2531     if (xen_enabled() && block->host == NULL) {
2532         /* We need to check if the requested address is in the RAM
2533          * because we don't want to map the entire memory in QEMU.
2534          * In that case just map until the end of the page.
2535          */
2536         if (block->offset == 0) {
2537             return xen_map_cache(addr, 0, 0, false);
2538         }
2539 
2540         block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2541     }
2542     return ramblock_ptr(block, addr);
2543 }
2544 
2545 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2546  * but takes a size argument.
2547  *
2548  * Called within RCU critical section.
2549  */
qemu_ram_ptr_length(RAMBlock * ram_block,ram_addr_t addr,hwaddr * size,bool lock)2550 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2551                                  hwaddr *size, bool lock)
2552 {
2553     RAMBlock *block = ram_block;
2554     if (*size == 0) {
2555         return NULL;
2556     }
2557 
2558     if (block == NULL) {
2559         block = qemu_get_ram_block(addr);
2560         addr -= block->offset;
2561     }
2562     *size = MIN(*size, block->max_length - addr);
2563 
2564     if (xen_enabled() && block->host == NULL) {
2565         /* We need to check if the requested address is in the RAM
2566          * because we don't want to map the entire memory in QEMU.
2567          * In that case just map the requested area.
2568          */
2569         if (block->offset == 0) {
2570             return xen_map_cache(addr, *size, lock, lock);
2571         }
2572 
2573         block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2574     }
2575 
2576     return ramblock_ptr(block, addr);
2577 }
2578 
2579 /* Return the offset of a hostpointer within a ramblock */
qemu_ram_block_host_offset(RAMBlock * rb,void * host)2580 ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2581 {
2582     ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2583     assert((uintptr_t)host >= (uintptr_t)rb->host);
2584     assert(res < rb->max_length);
2585 
2586     return res;
2587 }
2588 
/*
 * Translates a host ptr back to a RAMBlock and an offset in that RAMBlock.
 *
 * ptr: Host pointer to look up
 * round_offset: If true round the result offset down to a page boundary.
 *               Note that this is only applied on the non-Xen path; the
 *               Xen path below stores the offset unrounded.
 * *offset: set to result offset within the RAMBlock; left untouched when
 *          NULL is returned
 *
 * Returns: RAMBlock (or NULL if not found)
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
{
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
        /* With Xen the map cache, not block->host, knows the ram address. */
        ram_addr_t ram_addr;
        RCU_READ_LOCK_GUARD();
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = ram_addr - block->offset;
        }
        return block;
    }

    RCU_READ_LOCK_GUARD();
    /* Fast path: try the most recently used block first. */
    block = atomic_rcu_read(&ram_list.mru_block);
    /*
     * NOTE(review): the range check relies on a pointer below block->host
     * producing a difference that compares as a huge value against
     * max_length (presumably an unsigned type) -- confirm.
     */
    if (block && block->host && host - block->host < block->max_length) {
        goto found;
    }

    RAMBLOCK_FOREACH(block) {
        /* This case happens when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
        if (host - block->host < block->max_length) {
            goto found;
        }
    }

    return NULL;

found:
    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
    return block;
}
2648 
2649 /*
2650  * Finds the named RAMBlock
2651  *
2652  * name: The name of RAMBlock to find
2653  *
2654  * Returns: RAMBlock (or NULL if not found)
2655  */
qemu_ram_block_by_name(const char * name)2656 RAMBlock *qemu_ram_block_by_name(const char *name)
2657 {
2658     RAMBlock *block;
2659 
2660     RAMBLOCK_FOREACH(block) {
2661         if (!strcmp(name, block->idstr)) {
2662             return block;
2663         }
2664     }
2665 
2666     return NULL;
2667 }
2668 
2669 /* Some of the softmmu routines need to translate from a host pointer
2670    (typically a TLB entry) back to a ram offset.  */
qemu_ram_addr_from_host(void * ptr)2671 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2672 {
2673     RAMBlock *block;
2674     ram_addr_t offset;
2675 
2676     block = qemu_ram_block_from_host(ptr, false, &offset);
2677     if (!block) {
2678         return RAM_ADDR_INVALID;
2679     }
2680 
2681     return block->offset + offset;
2682 }
2683 
/*
 * Generate a debug exception if a watchpoint has been hit.
 *
 * cpu: CPU performing the access
 * addr, len: accessed virtual range; addr is first passed through the CPU
 *            class's adjust_watchpoint_address hook
 * attrs: transaction attributes, recorded in the matching watchpoint
 * flags: access kind, tested against each watchpoint's flags
 * ra: host return address handed to tb_check_watchpoint() and to the
 *     state-restore calls
 *
 * Only valid when TCG is enabled (asserted).
 */
void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                          MemTxAttrs attrs, int flags, uintptr_t ra)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUWatchpoint *wp;

    assert(tcg_enabled());
    if (cpu->watchpoint_hit) {
        /*
         * We re-entered the check after replacing the TB.
         * Now raise the debug interrupt so that it will
         * trigger after the current instruction.
         */
        qemu_mutex_lock_iothread();
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        qemu_mutex_unlock_iothread();
        return;
    }

    addr = cc->adjust_watchpoint_address(cpu, addr, len);
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (watchpoint_address_matches(wp, addr, len)
            && (wp->flags & flags)) {
            /* Record which kind of access hit, and where. */
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = MAX(addr, wp->vaddr);
            wp->hitattrs = attrs;
            if (!cpu->watchpoint_hit) {
                /* CPU-owned watchpoints may be vetoed by the CPU class. */
                if (wp->flags & BP_CPU &&
                    !cc->debug_check_watchpoint(cpu, wp)) {
                    wp->flags &= ~BP_WATCHPOINT_HIT;
                    continue;
                }
                cpu->watchpoint_hit = wp;

                mmap_lock();
                tb_check_watchpoint(cpu, ra);
                /*
                 * Both branches drop the mmap lock before leaving via a
                 * cpu_loop_exit_*() call (presumably non-returning --
                 * confirm against their definitions).
                 */
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                    cpu->exception_index = EXCP_DEBUG;
                    mmap_unlock();
                    cpu_loop_exit_restore(cpu, ra);
                } else {
                    /* Force execution of one insn next time.  */
                    cpu->cflags_next_tb = 1 | curr_cflags();
                    mmap_unlock();
                    if (ra) {
                        cpu_restore_state(cpu, ra, true);
                    }
                    cpu_loop_exit_noexc(cpu);
                }
            }
        } else {
            /* Not matched by this access: clear any stale hit marker. */
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
}
2744 
/*
 * Forward declarations: the subpage handlers below call these FlatView
 * accessors, which are defined further down in this file.
 */
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len);
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const uint8_t *buf, hwaddr len);
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
                                  bool is_write, MemTxAttrs attrs);
2751 
subpage_read(void * opaque,hwaddr addr,uint64_t * data,unsigned len,MemTxAttrs attrs)2752 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2753                                 unsigned len, MemTxAttrs attrs)
2754 {
2755     subpage_t *subpage = opaque;
2756     uint8_t buf[8];
2757     MemTxResult res;
2758 
2759 #if defined(DEBUG_SUBPAGE)
2760     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2761            subpage, len, addr);
2762 #endif
2763     res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2764     if (res) {
2765         return res;
2766     }
2767     *data = ldn_p(buf, len);
2768     return MEMTX_OK;
2769 }
2770 
subpage_write(void * opaque,hwaddr addr,uint64_t value,unsigned len,MemTxAttrs attrs)2771 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2772                                  uint64_t value, unsigned len, MemTxAttrs attrs)
2773 {
2774     subpage_t *subpage = opaque;
2775     uint8_t buf[8];
2776 
2777 #if defined(DEBUG_SUBPAGE)
2778     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2779            " value %"PRIx64"\n",
2780            __func__, subpage, len, addr, value);
2781 #endif
2782     stn_p(buf, len, value);
2783     return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2784 }
2785 
/*
 * Validity hook for a subpage region: an access is accepted iff the
 * underlying FlatView accepts it at (base + addr).
 */
static bool subpage_accepts(void *opaque, hwaddr addr,
                            unsigned len, bool is_write,
                            MemTxAttrs attrs)
{
    subpage_t *sp = opaque;

#if defined(DEBUG_SUBPAGE)
    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
           __func__, sp, is_write ? 'w' : 'r', len, addr);
#endif

    return flatview_access_valid(sp->fv, addr + sp->base, len, is_write,
                                 attrs);
}
2799 
/*
 * Ops table for subpage I/O regions: accesses of 1 to 8 bytes are both
 * valid and implemented, and are routed through the handlers above, with
 * subpage_accepts() deciding per-access validity.
 */
static const MemoryRegionOps subpage_ops = {
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
    .impl.min_access_size = 1,
    .impl.max_access_size = 8,
    .valid.min_access_size = 1,
    .valid.max_access_size = 8,
    .valid.accepts = subpage_accepts,
    .endianness = DEVICE_NATIVE_ENDIAN,
};
2810 
/*
 * Assign @section to every sub_section slot covering [start, end]
 * (both inclusive, as offsets within the page).
 *
 * Returns 0 on success, or -1 if either bound is not below TARGET_PAGE_SIZE.
 */
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section)
{
    int first, last, i;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) {
        return -1;
    }

    first = SUBPAGE_IDX(start);
    last = SUBPAGE_IDX(end);
#if defined(DEBUG_SUBPAGE)
    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
           __func__, mmio, start, end, first, last, section);
#endif
    for (i = first; i <= last; i++) {
        mmio->sub_section[i] = section;
    }

    return 0;
}
2830 
subpage_init(FlatView * fv,hwaddr base)2831 static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2832 {
2833     subpage_t *mmio;
2834 
2835     /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2836     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2837     mmio->fv = fv;
2838     mmio->base = base;
2839     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2840                           NULL, TARGET_PAGE_SIZE);
2841     mmio->iomem.subpage = true;
2842 #if defined(DEBUG_SUBPAGE)
2843     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2844            mmio, base, TARGET_PAGE_SIZE);
2845 #endif
2846 
2847     return mmio;
2848 }
2849 
/*
 * Add to @map a section that starts at offset 0 of both the address space
 * and @mr and has size int128_2_64(), and return the index reported by
 * phys_section_add().  @fv must not be NULL.
 */
static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
{
    MemoryRegionSection whole;

    assert(fv);

    whole = (MemoryRegionSection) {
        .fv = fv,
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
        .size = int128_2_64(),
    };

    return phys_section_add(map, &whole);
}
2863 
iotlb_to_section(CPUState * cpu,hwaddr index,MemTxAttrs attrs)2864 MemoryRegionSection *iotlb_to_section(CPUState *cpu,
2865                                       hwaddr index, MemTxAttrs attrs)
2866 {
2867     int asidx = cpu_asidx_from_attrs(cpu, attrs);
2868     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2869     AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2870     MemoryRegionSection *sections = d->map.sections;
2871 
2872     return &sections[index & ~TARGET_PAGE_MASK];
2873 }
2874 
/*
 * Initialise the global io_mem_unassigned region as an unnamed, ownerless
 * I/O region of size UINT64_MAX backed by unassigned_mem_ops.
 */
static void io_mem_init(void)
{
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                          NULL, UINT64_MAX);
}
2880 
address_space_dispatch_new(FlatView * fv)2881 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2882 {
2883     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2884     uint16_t n;
2885 
2886     n = dummy_section(&d->map, fv, &io_mem_unassigned);
2887     assert(n == PHYS_SECTION_UNASSIGNED);
2888 
2889     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2890 
2891     return d;
2892 }
2893 
/* Release the section map owned by @d, then @d itself. */
void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    phys_sections_free(&d->map);
    g_free(d);
}
2899 
/*
 * Empty run_on_cpu() work item; tcg_log_global_after_sync() schedules it
 * purely to wait for the target vCPU.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data d)
{
}
2903 
/*
 * MemoryListener hook: synchronise with the vCPU that owns @listener by
 * running an empty work item on it (skipped in record/replay mode).
 */
static void tcg_log_global_after_sync(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;

    /* Wait for the CPU to end the current TB.  This avoids the following
     * incorrect race:
     *
     *      vCPU                         migration
     *      ----------------------       -------------------------
     *      TLB check -> slow path
     *        notdirty_mem_write
     *          write to RAM
     *          mark dirty
     *                                   clear dirty flag
     *      TLB check -> fast path
     *                                   read memory
     *        write to RAM
     *
     * by pushing the migration thread's memory read after the vCPU thread has
     * written the memory.
     */
    if (replay_mode == REPLAY_MODE_NONE) {
        /*
         * VGA can make calls to this function while updating the screen.
         * In record/replay mode this causes a deadlock, because
         * run_on_cpu waits for rr mutex. Therefore no races are possible
         * in that mode, and run_on_cpu is only issued when record/replay
         * is not enabled.
         */
        cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
        run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}
2937 
/*
 * MemoryListener commit hook for a CPU address space (TCG only, asserted):
 * publish the address space's current dispatch table to the CPU and flush
 * the CPU's TLB.
 */
static void tcg_commit(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;
    AddressSpaceDispatch *d;

    assert(tcg_enabled());
    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu_reloading_memory_map();
    /* The CPU and TLB are protected by the iothread lock.
     * We reload the dispatch pointer now because cpu_reloading_memory_map()
     * may have split the RCU critical section.
     */
    d = address_space_to_dispatch(cpuas->as);
    atomic_rcu_set(&cpuas->memory_dispatch, d);
    tlb_flush(cpuas->cpu);
}
2956 
/*
 * Create the two root regions and their address spaces:
 *  - "system": a container region of size UINT64_MAX, root of
 *    address_space_memory ("memory");
 *  - "io": a 65536-byte I/O region backed by unassigned_io_ops, root of
 *    address_space_io ("I/O").
 */
static void memory_map_init(void)
{
    system_memory = g_malloc(sizeof(*system_memory));

    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
    address_space_init(&address_space_memory, system_memory, "memory");

    system_io = g_malloc(sizeof(*system_io));
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
                          65536);
    address_space_init(&address_space_io, system_io, "I/O");
}
2969 
/* Return the root "system" memory region set up by memory_map_init(). */
MemoryRegion *get_system_memory(void)
{
    return system_memory;
}
2974 
/* Return the root "io" region set up by memory_map_init(). */
MemoryRegion *get_system_io(void)
{
    return system_io;
}
2979 
2980 #endif /* !defined(CONFIG_USER_ONLY) */
2981 
2982 /* physical memory access (slow version, mainly for debug) */
2983 #if defined(CONFIG_USER_ONLY)
/*
 * User-mode debug access to guest memory.
 *
 * Copies @len bytes between @buf and guest address @addr, one target page
 * at a time.  Each page must be PAGE_VALID and have PAGE_WRITE (for writes)
 * or PAGE_READ (for reads) set, and must be lockable via lock_user().
 *
 * Returns 0 on success, or -1 as soon as a page fails one of those checks
 * (bytes for earlier pages have already been transferred by then).
 */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, target_ulong len, int is_write)
{
    while (len > 0) {
        target_ulong page = addr & TARGET_PAGE_MASK;
        /* Bytes left in the current page, capped at the remaining length. */
        target_ulong chunk = (page + TARGET_PAGE_SIZE) - addr;
        int flags;
        void *p;

        if (chunk > len) {
            chunk = len;
        }

        flags = page_get_flags(page);
        if (!(flags & PAGE_VALID)) {
            return -1;
        }

        if (is_write) {
            if (!(flags & PAGE_WRITE)) {
                return -1;
            }
            /* XXX: this code should not depend on lock_user */
            p = lock_user(VERIFY_WRITE, addr, chunk, 0);
            if (!p) {
                return -1;
            }
            memcpy(p, buf, chunk);
            unlock_user(p, addr, chunk);
        } else {
            if (!(flags & PAGE_READ)) {
                return -1;
            }
            /* XXX: this code should not depend on lock_user */
            p = lock_user(VERIFY_READ, addr, chunk, 1);
            if (!p) {
                return -1;
            }
            memcpy(buf, p, chunk);
            unlock_user(p, addr, 0);
        }

        len -= chunk;
        buf += chunk;
        addr += chunk;
    }

    return 0;
}
3022 
3023 #else
3024 
/*
 * Post-write bookkeeping for a direct RAM write of @length bytes at offset
 * @addr within @mr: invalidate translated code overlapping the range when
 * the code dirty bit needs setting, then mark the range dirty for the
 * remaining dirty-log clients.
 */
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
{
    const uint8_t code_bit = 1 << DIRTY_MEMORY_CODE;
    uint8_t mask = memory_region_get_dirty_log_mask(mr);

    /* From here on @addr is a ram address, not a region offset. */
    addr += memory_region_get_ram_addr(mr);

    /* No early return if the mask is or becomes 0, because
     * cpu_physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (mask) {
        mask = cpu_physical_memory_range_includes_clean(addr, length, mask);
    }

    if (mask & code_bit) {
        assert(tcg_enabled());
        tb_invalidate_phys_range(addr, addr + length);
        mask &= ~code_bit;
    }

    cpu_physical_memory_set_dirty_range(addr, length, mask);
}
3046 
/*
 * Run the invalidate-and-mark-dirty bookkeeping for [addr, addr + size)
 * of a ROM device region.  @mr must be a romd region (asserted).
 */
void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
{
    /*
     * In principle this function would work on other memory region types too,
     * but the ROM device use case is the only one where this operation is
     * necessary.  Other memory regions should use the
     * address_space_read/write() APIs.
     */
    assert(memory_region_is_romd(mr));

    invalidate_and_set_dirty(mr, addr, size);
}
3059 
/*
 * Choose the size of the next MMIO access to @mr.
 *
 * mr: target region; its ops supply the maximum valid access size and
 *     whether unaligned accesses are implemented
 * l: number of bytes the caller still wants to transfer
 * addr: address of the access, used to bound the size by its alignment
 *
 * Returns the largest power of two that is <= @l, <= the region's maximum
 * access size (4 when unspecified) and, unless the region implements
 * unaligned accesses, <= the alignment of @addr.
 *
 * @l is taken as hwaddr because every caller passes a hwaddr length.  With
 * a narrower parameter a length of 4 GiB or more was truncated before the
 * clamp: an exact multiple of 4 GiB became 0, and other large lengths
 * selected an access size from their low 32 bits only.
 */
static int memory_access_size(MemoryRegion *mr, hwaddr l, hwaddr addr)
{
    unsigned access_size_max = mr->ops->valid.max_access_size;

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (access_size_max == 0) {
        access_size_max = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        /*
         * Lowest set bit of addr.  Truncation to unsigned is harmless:
         * an alignment of 4 GiB or more becomes 0, which the test below
         * treats as "no alignment limit".
         */
        unsigned align_size_max = addr & -addr;
        if (align_size_max != 0 && align_size_max < access_size_max) {
            access_size_max = align_size_max;
        }
    }

    /* Don't attempt accesses larger than the maximum.  */
    if (l > access_size_max) {
        l = access_size_max;
    }
    l = pow2floor(l);

    /* l is now bounded by access_size_max, so it fits the int return. */
    return l;
}
3086 
/*
 * Prepare for an MMIO access to @mr.
 *
 * Takes the iothread lock when it is not already held and the region has
 * global_locking set.  If the region has flush_coalesced_mmio set, the
 * coalesced MMIO buffer is flushed, temporarily taking the iothread lock
 * for the flush when it is still not held at that point.
 *
 * Returns true if the iothread lock was acquired here and is still held on
 * return; the caller must then release it after the access.
 */
static bool prepare_mmio_access(MemoryRegion *mr)
{
    bool unlocked = !qemu_mutex_iothread_locked();
    bool release_lock = false;

    if (unlocked && mr->global_locking) {
        qemu_mutex_lock_iothread();
        unlocked = false;
        release_lock = true;
    }
    if (mr->flush_coalesced_mmio) {
        /* Only reached unlocked when the region opts out of global locking. */
        if (unlocked) {
            qemu_mutex_lock_iothread();
        }
        qemu_flush_coalesced_mmio_buffer();
        if (unlocked) {
            qemu_mutex_unlock_iothread();
        }
    }

    return release_lock;
}
3109 
/*
 * Write @len bytes from @buf to @fv starting at @addr.
 *
 * The caller has already translated the first chunk: @mr, @addr1 (offset
 * within @mr) and @l (length of that chunk) describe it.  Subsequent
 * chunks are translated here with flatview_translate().
 *
 * Returns the OR of the results of all MMIO dispatches (MEMTX_OK if every
 * chunk went to RAM or succeeded).
 *
 * Called within RCU critical section.
 */
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
                                           MemTxAttrs attrs,
                                           const uint8_t *buf,
                                           hwaddr len, hwaddr addr1,
                                           hwaddr l, MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

    for (;;) {
        if (!memory_access_is_direct(mr, true)) {
            /* I/O case: split into accesses the region can take. */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            /* XXX: could force current_cpu to NULL to avoid
               potential bugs */
            val = ldn_he_p(buf, l);
            result |= memory_region_dispatch_write(mr, addr1, val,
                                                   size_memop(l), attrs);
        } else {
            /* RAM case */
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
            memcpy(ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }

        /* Drop the iothread lock if prepare_mmio_access() took it. */
        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        /* Translate the next chunk. */
        l = len;
        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
    }

    return result;
}
3157 
3158 /* Called from RCU critical section.  */
flatview_write(FlatView * fv,hwaddr addr,MemTxAttrs attrs,const uint8_t * buf,hwaddr len)3159 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3160                                   const uint8_t *buf, hwaddr len)
3161 {
3162     hwaddr l;
3163     hwaddr addr1;
3164     MemoryRegion *mr;
3165     MemTxResult result = MEMTX_OK;
3166 
3167     l = len;
3168     mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3169     result = flatview_write_continue(fv, addr, attrs, buf, len,
3170                                      addr1, l, mr);
3171 
3172     return result;
3173 }
3174 
/*
 * Read @len bytes from @fv starting at @addr into @buf.
 *
 * The caller has already translated the first chunk: @mr, @addr1 (offset
 * within @mr) and @l (length of that chunk) describe it.  Subsequent
 * chunks are translated here with flatview_translate().
 *
 * Returns the OR of the results of all MMIO dispatches (MEMTX_OK if every
 * chunk came from RAM or succeeded).
 *
 * Called within RCU critical section.
 */
MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, uint8_t *buf,
                                   hwaddr len, hwaddr addr1, hwaddr l,
                                   MemoryRegion *mr)
{
    uint8_t *ptr;
    uint64_t val;
    MemTxResult result = MEMTX_OK;
    bool release_lock = false;

    for (;;) {
        if (!memory_access_is_direct(mr, false)) {
            /* I/O case */
            release_lock |= prepare_mmio_access(mr);
            l = memory_access_size(mr, l, addr1);
            result |= memory_region_dispatch_read(mr, addr1, &val,
                                                  size_memop(l), attrs);
            stn_he_p(buf, l, val);
        } else {
            /* RAM case */
            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
            memcpy(buf, ptr, l);
        }

        /* Drop the iothread lock if prepare_mmio_access() took it. */
        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        len -= l;
        buf += l;
        addr += l;

        if (!len) {
            break;
        }

        /* Translate the next chunk. */
        l = len;
        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
    }

    return result;
}
3219 
3220 /* Called from RCU critical section.  */
flatview_read(FlatView * fv,hwaddr addr,MemTxAttrs attrs,uint8_t * buf,hwaddr len)3221 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3222                                  MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3223 {
3224     hwaddr l;
3225     hwaddr addr1;
3226     MemoryRegion *mr;
3227 
3228     l = len;
3229     mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3230     return flatview_read_continue(fv, addr, attrs, buf, len,
3231                                   addr1, l, mr);
3232 }
3233 
address_space_read_full(AddressSpace * as,hwaddr addr,MemTxAttrs attrs,uint8_t * buf,hwaddr len)3234 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3235                                     MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3236 {
3237     MemTxResult result = MEMTX_OK;
3238     FlatView *fv;
3239 
3240     if (len > 0) {
3241         RCU_READ_LOCK_GUARD();
3242         fv = address_space_to_flatview(as);
3243         result = flatview_read(fv, addr, attrs, buf, len);
3244     }
3245 
3246     return result;
3247 }
3248 
address_space_write(AddressSpace * as,hwaddr addr,MemTxAttrs attrs,const uint8_t * buf,hwaddr len)3249 MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3250                                 MemTxAttrs attrs,
3251                                 const uint8_t *buf, hwaddr len)
3252 {
3253     MemTxResult result = MEMTX_OK;
3254     FlatView *fv;
3255 
3256     if (len > 0) {
3257         RCU_READ_LOCK_GUARD();
3258         fv = address_space_to_flatview(as);
3259         result = flatview_write(fv, addr, attrs, buf, len);
3260     }
3261 
3262     return result;
3263 }
3264 
address_space_rw(AddressSpace * as,hwaddr addr,MemTxAttrs attrs,uint8_t * buf,hwaddr len,bool is_write)3265 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3266                              uint8_t *buf, hwaddr len, bool is_write)
3267 {
3268     if (is_write) {
3269         return address_space_write(as, addr, attrs, buf, len);
3270     } else {
3271         return address_space_read_full(as, addr, attrs, buf, len);
3272     }
3273 }
3274 
/*
 * Convenience wrapper around address_space_rw() on address_space_memory
 * with unspecified transaction attributes.  The transaction result is
 * discarded.
 */
void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                            hwaddr len, int is_write)
{
    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                     buf, len, is_write);
}
3281 
/* Operation selector for address_space_write_rom_internal(). */
enum write_rom_type {
    WRITE_DATA,     /* copy the caller's buffer into RAM/ROM */
    FLUSH_CACHE,    /* flush the host icache over the range, no copy */
};
3286 
address_space_write_rom_internal(AddressSpace * as,hwaddr addr,MemTxAttrs attrs,const uint8_t * buf,hwaddr len,enum write_rom_type type)3287 static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3288                                                            hwaddr addr,
3289                                                            MemTxAttrs attrs,
3290                                                            const uint8_t *buf,
3291                                                            hwaddr len,
3292                                                            enum write_rom_type type)
3293 {
3294     hwaddr l;
3295     uint8_t *ptr;
3296     hwaddr addr1;
3297     MemoryRegion *mr;
3298 
3299     RCU_READ_LOCK_GUARD();
3300     while (len > 0) {
3301         l = len;
3302         mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3303 
3304         if (!(memory_region_is_ram(mr) ||
3305               memory_region_is_romd(mr))) {
3306             l = memory_access_size(mr, l, addr1);
3307         } else {
3308             /* ROM/RAM case */
3309             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3310             switch (type) {
3311             case WRITE_DATA:
3312                 memcpy(ptr, buf, l);
3313                 invalidate_and_set_dirty(mr, addr1, l);
3314                 break;
3315             case FLUSH_CACHE:
3316                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3317                 break;
3318             }
3319         }
3320         len -= l;
3321         buf += l;
3322         addr += l;
3323     }
3324     return MEMTX_OK;
3325 }
3326 
/*
 * used for ROM loading : can write in RAM and ROM
 *
 * Thin wrapper selecting the WRITE_DATA operation of
 * address_space_write_rom_internal().
 */
MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs,
                                    const uint8_t *buf, hwaddr len)
{
    return address_space_write_rom_internal(as, addr, attrs,
                                            buf, len, WRITE_DATA);
}
3335 
/*
 * Flush the host instruction cache for the guest-physical range
 * [start, start + len) of address_space_memory.  Does nothing under TCG.
 */
void cpu_flush_icache_range(hwaddr start, hwaddr len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

    /* No data is written, hence the NULL buffer. */
    address_space_write_rom_internal(&address_space_memory,
                                     start, MEMTXATTRS_UNSPECIFIED,
                                     NULL, len, FLUSH_CACHE);
}
3352 
/*
 * State of the single, file-global bounce buffer.
 *
 * NOTE(review): the users of mr/buffer/addr/len are outside this excerpt
 * (presumably the address-space map/unmap code) -- confirm their exact
 * meaning there.  in_use is read with atomic_read() by
 * cpu_register_map_client() to decide whether waiters can be notified
 * immediately.
 */
typedef struct {
    MemoryRegion *mr;
    void *buffer;
    hwaddr addr;
    hwaddr len;
    bool in_use;
} BounceBuffer;

static BounceBuffer bounce;
3362 
/*
 * A registered map client: a bottom half to schedule when map clients are
 * notified, linked into map_client_list.
 */
typedef struct MapClient {
    QEMUBH *bh;
    QLIST_ENTRY(MapClient) link;
} MapClient;

/* Protects map_client_list; initialised in cpu_exec_init_all(). */
QemuMutex map_client_list_lock;
static QLIST_HEAD(, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
3371 
/*
 * Unlink @client from its list and free it.  The callers in this file
 * hold map_client_list_lock.
 */
static void cpu_unregister_map_client_do(MapClient *client)
{
    QLIST_REMOVE(client, link);
    g_free(client);
}
3377 
cpu_notify_map_clients_locked(void)3378 static void cpu_notify_map_clients_locked(void)
3379 {
3380     MapClient *client;
3381 
3382     while (!QLIST_EMPTY(&map_client_list)) {
3383         client = QLIST_FIRST(&map_client_list);
3384         qemu_bh_schedule(client->bh);
3385         cpu_unregister_map_client_do(client);
3386     }
3387 }
3388 
/*
 * Register @bh to be scheduled when map clients are next notified.
 *
 * If the bounce buffer is not in use at registration time, all registered
 * clients (including this one) are notified immediately.
 */
void cpu_register_map_client(QEMUBH *bh)
{
    MapClient *entry = g_malloc(sizeof(*entry));

    /* The entry is private until it is inserted under the lock below. */
    entry->bh = bh;

    qemu_mutex_lock(&map_client_list_lock);
    QLIST_INSERT_HEAD(&map_client_list, entry, link);
    if (!atomic_read(&bounce.in_use)) {
        cpu_notify_map_clients_locked();
    }
    qemu_mutex_unlock(&map_client_list_lock);
}
3401 
/*
 * One-time initialisation of this file's global state: the RAM list mutex,
 * the final target page size, the unassigned I/O region, the root
 * memory/I/O address spaces and the map client list lock.
 */
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);
    /* The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();
    io_mem_init();
    memory_map_init();
    qemu_mutex_init(&map_client_list_lock);
}
3417 
/*
 * Remove the first registered map client whose bottom half is @bh, if any.
 * The bottom half itself is left untouched.
 */
void cpu_unregister_map_client(QEMUBH *bh)
{
    MapClient *entry;

    qemu_mutex_lock(&map_client_list_lock);
    QLIST_FOREACH(entry, &map_client_list, link) {
        if (entry->bh != bh) {
            continue;
        }
        /* Leave the loop at once: @entry is freed by the call below. */
        cpu_unregister_map_client_do(entry);
        break;
    }
    qemu_mutex_unlock(&map_client_list_lock);
}
3431 
/* Locking wrapper around cpu_notify_map_clients_locked(): takes
 * map_client_list_lock, notifies every registered client, and releases
 * the lock.
 */
static void cpu_notify_map_clients(void)
{
    qemu_mutex_lock(&map_client_list_lock);
    cpu_notify_map_clients_locked();
    qemu_mutex_unlock(&map_client_list_lock);
}
3438 
/* Check whether an access of @len bytes at @addr is acceptable to every
 * region it touches in @fv.  Regions that can be accessed directly are
 * accepted as-is; for the others the range is split into individual
 * accesses and each one is checked with memory_region_access_valid().
 * Returns false as soon as one piece is rejected.
 */
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
                                  bool is_write, MemTxAttrs attrs)
{
    while (len != 0) {
        hwaddr chunk = len;
        hwaddr region_off;
        MemoryRegion *region;

        region = flatview_translate(fv, addr, &region_off, &chunk,
                                    is_write, attrs);
        if (!memory_access_is_direct(region, is_write)) {
            chunk = memory_access_size(region, chunk, addr);
            if (!memory_region_access_valid(region, region_off, chunk,
                                            is_write, attrs)) {
                return false;
            }
        }

        addr += chunk;
        len -= chunk;
    }
    return true;
}
3460 
/* Check whether an access of @len bytes at @addr in @as would be valid,
 * using the address space's current FlatView.  The lookup and the check
 * are done after RCU_READ_LOCK_GUARD().
 */
bool address_space_access_valid(AddressSpace *as, hwaddr addr,
                                hwaddr len, bool is_write,
                                MemTxAttrs attrs)
{
    RCU_READ_LOCK_GUARD();
    return flatview_access_valid(address_space_to_flatview(as), addr, len,
                                 is_write, attrs);
}
3473 
3474 static hwaddr
flatview_extend_translation(FlatView * fv,hwaddr addr,hwaddr target_len,MemoryRegion * mr,hwaddr base,hwaddr len,bool is_write,MemTxAttrs attrs)3475 flatview_extend_translation(FlatView *fv, hwaddr addr,
3476                             hwaddr target_len,
3477                             MemoryRegion *mr, hwaddr base, hwaddr len,
3478                             bool is_write, MemTxAttrs attrs)
3479 {
3480     hwaddr done = 0;
3481     hwaddr xlat;
3482     MemoryRegion *this_mr;
3483 
3484     for (;;) {
3485         target_len -= len;
3486         addr += len;
3487         done += len;
3488         if (target_len == 0) {
3489             return done;
3490         }
3491 
3492         len = target_len;
3493         this_mr = flatview_translate(fv, addr, &xlat,
3494                                      &len, is_write, attrs);
3495         if (this_mr != mr || xlat != base + done) {
3496             return done;
3497         }
3498     }
3499 }
3500 
/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 *
 * A request with *plen == 0 returns NULL and leaves *plen untouched.
 * On success a reference is held on the backing MemoryRegion; it is dropped
 * by address_space_unmap().
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write,
                        MemTxAttrs attrs)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;
    FlatView *fv;

    if (len == 0) {
        return NULL;
    }

    l = len;
    RCU_READ_LOCK_GUARD();
    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    if (!memory_access_is_direct(mr, is_write)) {
        /* Not directly accessible: use the single global bounce buffer.
         * atomic_xchg() claims it; if it was already claimed, give up.
         */
        if (atomic_xchg(&bounce.in_use, true)) {
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            /* Read mapping: pre-fill the buffer with the current contents.
             * NOTE(review): this uses MEMTXATTRS_UNSPECIFIED rather than the
             * caller's @attrs - confirm that is intended.
             */
            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
        }

        *plen = l;
        return bounce.buffer;
    }


    /* Direct case: extend the mapping over as many contiguous bytes of the
     * same region as possible and hand back a host pointer into it.
     */
    memory_region_ref(mr);
    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                        l, is_write, attrs);
    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);

    return ptr;
}
3558 
/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 *
 * @len is not used by this function.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
{
    if (buffer != bounce.buffer) {
        /* Direct mapping: find the region behind the host pointer. */
        MemoryRegion *mr;
        ram_addr_t addr1;

        mr = memory_region_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }
    /* Bounce buffer: for a write mapping, copy what the caller wrote back
     * to the address recorded at map time (with MEMTXATTRS_UNSPECIFIED).
     */
    if (is_write) {
        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
                            bounce.buffer, access_len);
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
    memory_region_unref(bounce.mr);
    /* Release the bounce buffer, then schedule the registered clients. */
    atomic_mb_set(&bounce.in_use, false);
    cpu_notify_map_clients();
}
3591 
/* Convenience wrapper: address_space_map() on address_space_memory with
 * MEMTXATTRS_UNSPECIFIED.  See address_space_map() for the meaning of
 * *plen and of a NULL return.
 */
void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write,
                             MEMTXATTRS_UNSPECIFIED);
}
3599 
/* Convenience wrapper: address_space_unmap() on address_space_memory.
 * @buffer must come from cpu_physical_memory_map(); @access_len is the
 * number of bytes the caller actually read or wrote.
 */
void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    /* No "return <expr>;" here: ISO C does not allow a return statement
     * with an expression in a function returning void.
     */
    address_space_unmap(&address_space_memory, buffer, len, is_write,
                        access_len);
}
3605 
/* Parameters for memory_ldst.inc.c, AddressSpace flavour: the first argument
 * is an AddressSpace, SUFFIX is empty, translation goes through
 * address_space_translate(), and RCU_READ_LOCK/RCU_READ_UNLOCK expand to
 * real rcu_read_lock()/rcu_read_unlock() calls.
 * NOTE(review): the same macros are defined again further down without an
 * #undef in this file - presumably memory_ldst.inc.c undefines them; confirm.
 */
#define ARG1_DECL                AddressSpace *as
#define ARG1                     as
#define SUFFIX
#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
#define RCU_READ_LOCK(...)       rcu_read_lock()
#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
#include "memory_ldst.inc.c"
3613 
/* Initialize @cache for accesses to the @len bytes at @addr in @as.
 *
 * Stores the address space's FlatView and the translated section in @cache
 * and takes a reference on the section's MemoryRegion; both are released by
 * address_space_cache_destroy().  When the region can be accessed directly,
 * cache->ptr is a host pointer to the data, otherwise it is NULL.
 *
 * @len must be non-zero.
 *
 * Returns the number of bytes the cache covers, which may be less than
 * @len.
 */
int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    AddressSpaceDispatch *d;
    hwaddr l;
    MemoryRegion *mr;
    Int128 diff;

    assert(len > 0);

    l = len;
    cache->fv = address_space_get_flatview(as);
    d = flatview_to_dispatch(cache->fv);
    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);

    /* cache->xlat is relative to cache->mrs.mr, not to the section itself.
     * Work out how many bytes lie between cache->xlat and the end of the
     * section and never claim more than that, so that a cache over a
     * non-direct (MMIO) section cannot reach past the section.  This is a
     * no-op when the translation already limited l to the section.
     */
    diff = int128_sub(cache->mrs.size,
                      int128_make64(cache->xlat -
                                    cache->mrs.offset_within_region));
    l = int128_get64(int128_min(diff, int128_make64(l)));

    mr = cache->mrs.mr;
    memory_region_ref(mr);
    if (memory_access_is_direct(mr, is_write)) {
        /* We don't care about the memory attributes here as we're only
         * doing this if we found actual RAM, which behaves the same
         * regardless of attributes; so UNSPECIFIED is fine.
         */
        l = flatview_extend_translation(cache->fv, addr, len, mr,
                                        cache->xlat, l, is_write,
                                        MEMTXATTRS_UNSPECIFIED);
        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
    } else {
        cache->ptr = NULL;
    }

    cache->len = l;
    cache->is_write = is_write;
    return l;
}
3650 
/* Report that @access_len bytes at offset @addr of @cache were written.
 * The cache must have been initialized for writing.  Only caches with a
 * host pointer (cache->ptr != NULL) need the invalidate-and-dirty step;
 * for the others this is a no-op.
 */
void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    assert(cache->is_write);
    if (likely(cache->ptr)) {
        /* @addr is relative to the cache; add cache->xlat to get the offset
         * within the region.
         */
        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
    }
}
3660 
/* Release what address_space_cache_init() acquired for @cache: the
 * reference on the MemoryRegion and the FlatView.  A cache whose
 * mrs.mr is already NULL is left alone, so calling this twice is harmless.
 */
void address_space_cache_destroy(MemoryRegionCache *cache)
{
    MemoryRegion *region = cache->mrs.mr;

    if (region != NULL) {
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(cache->ptr);
        }
        memory_region_unref(region);
        flatview_unref(cache->fv);

        /* Mark the cache as destroyed. */
        cache->mrs.mr = NULL;
        cache->fv = NULL;
    }
}
3675 
3676 /* Called from RCU critical section.  This function has the same
3677  * semantics as address_space_translate, but it only works on a
3678  * predefined range of a MemoryRegion that was mapped with
3679  * address_space_cache_init.
3680  */
address_space_translate_cached(MemoryRegionCache * cache,hwaddr addr,hwaddr * xlat,hwaddr * plen,bool is_write,MemTxAttrs attrs)3681 static inline MemoryRegion *address_space_translate_cached(
3682     MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3683     hwaddr *plen, bool is_write, MemTxAttrs attrs)
3684 {
3685     MemoryRegionSection section;
3686     MemoryRegion *mr;
3687     IOMMUMemoryRegion *iommu_mr;
3688     AddressSpace *target_as;
3689 
3690     assert(!cache->ptr);
3691     *xlat = addr + cache->xlat;
3692 
3693     mr = cache->mrs.mr;
3694     iommu_mr = memory_region_get_iommu(mr);
3695     if (!iommu_mr) {
3696         /* MMIO region.  */
3697         return mr;
3698     }
3699 
3700     section = address_space_translate_iommu(iommu_mr, xlat, plen,
3701                                             NULL, is_write, true,
3702                                             &target_as, attrs);
3703     return section.mr;
3704 }
3705 
3706 /* Called from RCU critical section. address_space_read_cached uses this
3707  * out of line function when the target is an MMIO or IOMMU region.
3708  */
3709 void
address_space_read_cached_slow(MemoryRegionCache * cache,hwaddr addr,void * buf,hwaddr len)3710 address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3711                                    void *buf, hwaddr len)
3712 {
3713     hwaddr addr1, l;
3714     MemoryRegion *mr;
3715 
3716     l = len;
3717     mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
3718                                         MEMTXATTRS_UNSPECIFIED);
3719     flatview_read_continue(cache->fv,
3720                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3721                            addr1, l, mr);
3722 }
3723 
3724 /* Called from RCU critical section. address_space_write_cached uses this
3725  * out of line function when the target is an MMIO or IOMMU region.
3726  */
3727 void
address_space_write_cached_slow(MemoryRegionCache * cache,hwaddr addr,const void * buf,hwaddr len)3728 address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3729                                     const void *buf, hwaddr len)
3730 {
3731     hwaddr addr1, l;
3732     MemoryRegion *mr;
3733 
3734     l = len;
3735     mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
3736                                         MEMTXATTRS_UNSPECIFIED);
3737     flatview_write_continue(cache->fv,
3738                             addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3739                             addr1, l, mr);
3740 }
3741 
/* Parameters for memory_ldst.inc.c, MemoryRegionCache flavour: the first
 * argument is a MemoryRegionCache, SUFFIX is _cached_slow, and translation
 * goes through address_space_translate_cached().  RCU_READ_LOCK and
 * RCU_READ_UNLOCK expand to no-ops here; address_space_translate_cached()
 * is documented as being called from an RCU critical section.
 */
#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached_slow
#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK()          ((void)0)
#define RCU_READ_UNLOCK()        ((void)0)
#include "memory_ldst.inc.c"
3749 
3750 /* virtual memory access for debug (includes writing to ROM) */
cpu_memory_rw_debug(CPUState * cpu,target_ulong addr,uint8_t * buf,target_ulong len,int is_write)3751 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3752                         uint8_t *buf, target_ulong len, int is_write)
3753 {
3754     hwaddr phys_addr;
3755     target_ulong l, page;
3756 
3757     cpu_synchronize_state(cpu);
3758     while (len > 0) {
3759         int asidx;
3760         MemTxAttrs attrs;
3761 
3762         page = addr & TARGET_PAGE_MASK;
3763         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3764         asidx = cpu_asidx_from_attrs(cpu, attrs);
3765         /* if no physical page mapped, return an error */
3766         if (phys_addr == -1)
3767             return -1;
3768         l = (page + TARGET_PAGE_SIZE) - addr;
3769         if (l > len)
3770             l = len;
3771         phys_addr += (addr & ~TARGET_PAGE_MASK);
3772         if (is_write) {
3773             address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
3774                                     attrs, buf, l);
3775         } else {
3776             address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3777                              attrs, buf, l, 0);
3778         }
3779         len -= l;
3780         buf += l;
3781         addr += l;
3782     }
3783     return 0;
3784 }
3785 
/*
 * Allows code that needs to deal with migration bitmaps etc to still be built
 * target independent.
 *
 * Returns TARGET_PAGE_SIZE.
 */
size_t qemu_target_page_size(void)
{
    return TARGET_PAGE_SIZE;
}
3794 
/* Function form of TARGET_PAGE_BITS, for callers that cannot use the
 * macro directly.
 */
int qemu_target_page_bits(void)
{
    return TARGET_PAGE_BITS;
}
3799 
/* Function form of TARGET_PAGE_BITS_MIN, for callers that cannot use the
 * macro directly.
 */
int qemu_target_page_bits_min(void)
{
    return TARGET_PAGE_BITS_MIN;
}
3804 #endif
3805 
/* Report the target's endianness as fixed at build time: true when
 * TARGET_WORDS_BIGENDIAN is defined, false otherwise.
 */
bool target_words_bigendian(void)
{
#ifndef TARGET_WORDS_BIGENDIAN
    return false;
#else
    return true;
#endif
}
3814 
3815 #ifndef CONFIG_USER_ONLY
/* Return true when @phys_addr in address_space_memory resolves to a
 * region that is neither RAM nor a ROM device in ROMD mode.
 */
bool cpu_physical_memory_is_io(hwaddr phys_addr)
{
    hwaddr region_off;
    hwaddr span = 1;
    MemoryRegion *region;

    RCU_READ_LOCK_GUARD();
    region = address_space_translate(&address_space_memory,
                                     phys_addr, &region_off, &span, false,
                                     MEMTXATTRS_UNSPECIFIED);

    return !memory_region_is_ram(region) && !memory_region_is_romd(region);
}
3830 
/* Call @func(block, @opaque) for each RAM block, stopping at the first
 * call that returns non-zero.  Returns that value, or 0 when every call
 * returned 0 (including when there are no blocks).
 */
int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *blk;

    RCU_READ_LOCK_GUARD();
    RAMBLOCK_FOREACH(blk) {
        int rc = func(blk, opaque);

        if (rc != 0) {
            return rc;
        }
    }
    return 0;
}
3845 
3846 /*
3847  * Unmap pages of memory from start to start+length such that
3848  * they a) read as 0, b) Trigger whatever fault mechanism
3849  * the OS provides for postcopy.
3850  * The pages must be unmapped by the end of the function.
3851  * Returns: 0 on success, none-0 on failure
3852  *
3853  */
ram_block_discard_range(RAMBlock * rb,uint64_t start,size_t length)3854 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3855 {
3856     int ret = -1;
3857 
3858     uint8_t *host_startaddr = rb->host + start;
3859 
3860     if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3861         error_report("ram_block_discard_range: Unaligned start address: %p",
3862                      host_startaddr);
3863         goto err;
3864     }
3865 
3866     if ((start + length) <= rb->used_length) {
3867         bool need_madvise, need_fallocate;
3868         uint8_t *host_endaddr = host_startaddr + length;
3869         if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3870             error_report("ram_block_discard_range: Unaligned end address: %p",
3871                          host_endaddr);
3872             goto err;
3873         }
3874 
3875         errno = ENOTSUP; /* If we are missing MADVISE etc */
3876 
3877         /* The logic here is messy;
3878          *    madvise DONTNEED fails for hugepages
3879          *    fallocate works on hugepages and shmem
3880          */
3881         need_madvise = (rb->page_size == qemu_host_page_size);
3882         need_fallocate = rb->fd != -1;
3883         if (need_fallocate) {
3884             /* For a file, this causes the area of the file to be zero'd
3885              * if read, and for hugetlbfs also causes it to be unmapped
3886              * so a userfault will trigger.
3887              */
3888 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3889             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3890                             start, length);
3891             if (ret) {
3892                 ret = -errno;
3893                 error_report("ram_block_discard_range: Failed to fallocate "
3894                              "%s:%" PRIx64 " +%zx (%d)",
3895                              rb->idstr, start, length, ret);
3896                 goto err;
3897             }
3898 #else
3899             ret = -ENOSYS;
3900             error_report("ram_block_discard_range: fallocate not available/file"
3901                          "%s:%" PRIx64 " +%zx (%d)",
3902                          rb->idstr, start, length, ret);
3903             goto err;
3904 #endif
3905         }
3906         if (need_madvise) {
3907             /* For normal RAM this causes it to be unmapped,
3908              * for shared memory it causes the local mapping to disappear
3909              * and to fall back on the file contents (which we just
3910              * fallocate'd away).
3911              */
3912 #if defined(CONFIG_MADVISE)
3913             ret =  madvise(host_startaddr, length, MADV_DONTNEED);
3914             if (ret) {
3915                 ret = -errno;
3916                 error_report("ram_block_discard_range: Failed to discard range "
3917                              "%s:%" PRIx64 " +%zx (%d)",
3918                              rb->idstr, start, length, ret);
3919                 goto err;
3920             }
3921 #else
3922             ret = -ENOSYS;
3923             error_report("ram_block_discard_range: MADVISE not available"
3924                          "%s:%" PRIx64 " +%zx (%d)",
3925                          rb->idstr, start, length, ret);
3926             goto err;
3927 #endif
3928         }
3929         trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
3930                                       need_madvise, need_fallocate, ret);
3931     } else {
3932         error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3933                      "/%zx/" RAM_ADDR_FMT")",
3934                      rb->idstr, start, length, rb->used_length);
3935     }
3936 
3937 err:
3938     return ret;
3939 }
3940 
/* Return true when @rb has the RAM_PMEM flag set. */
bool ramblock_is_pmem(RAMBlock *rb)
{
    return rb->flags & RAM_PMEM;
}
3945 
3946 #endif
3947 
/* Settle qemu_host_page_size and derive qemu_host_page_mask from it.
 * A size of 0 (not set yet) becomes qemu_real_host_page_size, and the
 * result is raised to TARGET_PAGE_SIZE if it is smaller.
 */
void page_size_init(void)
{
    /* NOTE: we can always suppose that qemu_host_page_size >=
       TARGET_PAGE_SIZE */
    if (qemu_host_page_size == 0) {
        qemu_host_page_size = qemu_real_host_page_size;
    }
    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
        qemu_host_page_size = TARGET_PAGE_SIZE;
    }
    /* Negating the size gives the mask; assumes the size is a power of two
     * - TODO confirm against callers.
     */
    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
}
3960 
3961 #if !defined(CONFIG_USER_ONLY)
3962 
mtree_print_phys_entries(int start,int end,int skip,int ptr)3963 static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
3964 {
3965     if (start == end - 1) {
3966         qemu_printf("\t%3d      ", start);
3967     } else {
3968         qemu_printf("\t%3d..%-3d ", start, end - 1);
3969     }
3970     qemu_printf(" skip=%d ", skip);
3971     if (ptr == PHYS_MAP_NODE_NIL) {
3972         qemu_printf(" ptr=NIL");
3973     } else if (!skip) {
3974         qemu_printf(" ptr=#%d", ptr);
3975     } else {
3976         qemu_printf(" ptr=[%d]", ptr);
3977     }
3978     qemu_printf("\n");
3979 }
3980 
/* For a non-zero Int128 @size, evaluates to (size - 1) as a hwaddr, i.e.
 * the offset of the last byte of a region of that size; evaluates to 0
 * for a zero size.  Note that, despite the name, this is not the size.
 */
#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
                           int128_sub((size), int128_one())) : 0)
3983 
/* Dump the dispatch structure @d with qemu_printf(): first every physical
 * section, then every node of the physical page map.  @root is only used
 * to tag the section whose region is the root with " [ROOT]".
 */
void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
{
    int i;

    qemu_printf("  Dispatch\n");
    qemu_printf("    Physical sections\n");

    for (i = 0; i < d->map.sections_nb; ++i) {
        MemoryRegionSection *s = d->map.sections + i;
        /* Labels attached to the first four sections, by index. */
        const char *names[] = { " [unassigned]", " [not dirty]",
                                " [ROM]", " [watch]" };

        /* NOTE(review): the end address is computed from the region's size
         * (s->mr->size), not from the section's own size - confirm intended.
         */
        qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
                    " %s%s%s%s%s",
            i,
            s->offset_within_address_space,
            s->offset_within_address_space + MR_SIZE(s->mr->size),
            s->mr->name ? s->mr->name : "(noname)",
            i < ARRAY_SIZE(names) ? names[i] : "",
            s->mr == root ? " [ROOT]" : "",
            s == d->mru_section ? " [MRU]" : "",
            s->mr->is_iommu ? " [iommu]" : "");

        if (s->mr->alias) {
            qemu_printf(" alias=%s", s->mr->alias->name ?
                    s->mr->alias->name : "noname");
        }
        qemu_printf("\n");
    }

    qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
    for (i = 0; i < d->map.nodes_nb; ++i) {
        int j, jprev;
        PhysPageEntry prev;
        Node *n = d->map.nodes + i;

        qemu_printf("      [%d]\n", i);

        /* Collapse runs of consecutive entries with the same ptr and skip
         * into one line each: [jprev, j) is the run being accumulated and
         * prev holds its shared value.
         */
        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
            PhysPageEntry *pe = *n + j;

            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
                continue;
            }

            /* Entry j starts a new run: print the one that just ended. */
            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);

            jprev = j;
            prev = *pe;
        }

        /* Print the final run, which the loop leaves pending. */
        if (jprev != ARRAY_SIZE(*n)) {
            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
        }
    }
}
4041 
4042 #endif
4043