1 /////////////////////////////////////////////////////////////////////////
2 // $Id: paging.cc 14328 2021-07-27 19:18:34Z vruppert $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 //  Copyright (C) 2001-2021  The Bochs Project
6 //
7 //  This library is free software; you can redistribute it and/or
8 //  modify it under the terms of the GNU Lesser General Public
9 //  License as published by the Free Software Foundation; either
10 //  version 2 of the License, or (at your option) any later version.
11 //
12 //  This library is distributed in the hope that it will be useful,
13 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 //  Lesser General Public License for more details.
16 //
17 //  You should have received a copy of the GNU Lesser General Public
18 //  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 /////////////////////////////////////////////////////////////////////////
21 
22 #define NEED_CPU_REG_SHORTCUTS 1
23 #include "bochs.h"
24 #include "cpu.h"
25 #include "msr.h"
26 #define LOG_THIS BX_CPU_THIS_PTR
27 
28 #include "memory/memory-bochs.h"
29 #include "pc_system.h"
30 
31 // X86 Registers Which Affect Paging:
32 // ==================================
33 //
34 // CR0:
35 //   bit 31: PG, Paging (386+)
36 //   bit 16: WP, Write Protect (486+)
37 //     0: allow   supervisor level writes into user level RO pages
38 //     1: inhibit supervisor level writes into user level RO pages
39 //
40 // CR3:
41 //   bit 31..12: PDBR, Page Directory Base Register (386+)
42 //   bit      4: PCD, Page level Cache Disable (486+)
43 //     Controls caching of current page directory.  Affects only the processor's
44 //     internal caches (L1 and L2).
//     This flag is ignored if paging is disabled (PG=0) or the cache is disabled (CD=1).
46 //     Values:
47 //       0: Page Directory can be cached
48 //       1: Page Directory not cached
49 //   bit      3: PWT, Page level Writes Transparent (486+)
50 //     Controls write-through or write-back caching policy of current page
51 //     directory.  Affects only the processor's internal caches (L1 and L2).
//     This flag is ignored if paging is disabled (PG=0) or the cache is disabled (CD=1).
53 //     Values:
54 //       0: write-back caching enabled
55 //       1: write-through caching enabled
56 //
57 // CR4:
58 //   bit 4: PSE, Page Size Extension (Pentium+)
59 //     0: 4KByte pages (typical)
60 //     1: 4MByte or 2MByte pages
61 //   bit 5: PAE, Physical Address Extension (Pentium Pro+)
62 //     0: 32bit physical addresses
63 //     1: 36bit physical addresses
64 //   bit 7: PGE, Page Global Enable (Pentium Pro+)
65 //     The global page feature allows frequently used or shared pages
66 //     to be marked as global (PDE or PTE bit 8).  Global pages are
67 //     not flushed from TLB on a task switch or write to CR3.
68 //     Values:
69 //       0: disables global page feature
70 //       1: enables global page feature
71 //
//    page size extension and physical address size extension matrix (legacy mode)
73 //   ==============================================================================
74 //   CR0.PG  CR4.PAE  CR4.PSE  PDPE.PS  PDE.PS | page size   physical address size
75 //   ==============================================================================
76 //      0       X        X       R         X   |   --          paging disabled
77 //      1       0        0       R         X   |   4K              32bits
78 //      1       0        1       R         0   |   4K              32bits
79 //      1       0        1       R         1   |   4M              32bits
80 //      1       1        X       R         0   |   4K              36bits
81 //      1       1        X       R         1   |   2M              36bits
82 
//     page size extension and physical address size extension matrix (long mode)
84 //   ==============================================================================
85 //   CR0.PG  CR4.PAE  CR4.PSE  PDPE.PS  PDE.PS | page size   physical address size
86 //   ==============================================================================
87 //      1       1        X       0         0   |   4K              52bits
88 //      1       1        X       0         1   |   2M              52bits
89 //      1       1        X       1         -   |   1G              52bits
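//
//   Illustrative sketch of the linear address splits behind the matrices above
//   (example code, not part of the original tables):
//
//     // legacy 32-bit paging (CR4.PAE=0): 10/10/12 split, 4K pages
//     pde_index = (laddr >> 22) & 0x3ff;
//     pte_index = (laddr >> 12) & 0x3ff;
//     offset    =  laddr        & 0xfff;
//
//     // PAE paging (CR4.PAE=1): 2/9/9/12 split; a PDE with PS=1 maps a 2M page
//     pdpte_index = (laddr >> 30) & 0x3;
//     pde_index   = (laddr >> 21) & 0x1ff;
//     pte_index   = (laddr >> 12) & 0x1ff;
//
//     // long mode adds a PML4 level (9/9/9/9/12); a PDPTE with PS=1 maps a 1G page
//     pml4_index  = (laddr >> 39) & 0x1ff;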
90 
91 
92 // Page Directory/Table Entry Fields Defined:
93 // ==========================================
94 // NX: No Execute
95 //   This bit controls the ability to execute code from all physical
96 //   pages mapped by the table entry.
97 //     0: Code can be executed from the mapped physical pages
98 //     1: Code cannot be executed
99 //   The NX bit can only be set when the no-execute page-protection
//   feature is enabled by setting EFER.NXE=1. If EFER.NXE=0, the
//   NX bit is treated as reserved; in this case, a #PF occurs if the
//   NX bit is not cleared to zero.
103 //
104 // G: Global flag
//   Indicates a global page when set.  When a page is marked
106 //   global and the PGE flag in CR4 is set, the page table or
107 //   directory entry for the page is not invalidated in the TLB
108 //   when CR3 is loaded or a task switch occurs.  Only software
109 //   clears and sets this flag.  For page directory entries that
110 //   point to page tables, this flag is ignored and the global
111 //   characteristics of a page are set in the page table entries.
112 //
113 // PS: Page Size flag
114 //   Only used in page directory entries.  When PS=0, the page
115 //   size is 4KBytes and the page directory entry points to a
116 //   page table.  When PS=1, the page size is 4MBytes for
//   normal 32-bit addressing and 2MBytes when the physical address
//   extension (PAE) is used.
119 //
120 // PAT: Page-Attribute Table
121 //   This bit is only present in the lowest level of the page
122 //   translation hierarchy. The PAT bit is the high-order bit
123 //   of a 3-bit index into the PAT register. The other two
124 //   bits involved in forming the index are the PCD and PWT
125 //   bits.
126 //
127 // D: Dirty bit:
128 //   Processor sets the Dirty bit in the 2nd-level page table before a
129 //   write operation to an address mapped by that page table entry.
130 //   Dirty bit in directory entries is undefined.
131 //
132 // A: Accessed bit:
133 //   Processor sets the Accessed bits in both levels of page tables before
134 //   a read/write operation to a page.
135 //
136 // PCD: Page level Cache Disable
137 //   Controls caching of individual pages or page tables.
//   This provides a per-page mechanism to disable caching for
//   pages which contain memory mapped IO, or which otherwise
//   should not be cached.  The processor ignores this flag if paging
141 //   is not used (CR0.PG=0) or the cache disable bit is set (CR0.CD=1).
142 //   Values:
143 //     0: page or page table can be cached
144 //     1: page or page table is not cached (prevented)
145 //
146 // PWT: Page level Write Through
147 //   Controls the write-through or write-back caching policy of individual
148 //   pages or page tables.  Processor ignores this flag if paging
149 //   is not used (CR0.PG=0) or the cache disable bit is set (CR0.CD=1).
150 //   Values:
151 //     0: write-back caching
152 //     1: write-through caching
153 //
154 // U/S: User/Supervisor level
155 //   0: Supervisor level - for the OS, drivers, etc.
156 //   1: User level - application code and data
157 //
158 // R/W: Read/Write access
159 //   0: read-only access
160 //   1: read/write access
161 //
162 // P: Present
163 //   0: Not present
164 //   1: Present
165 // ==========================================
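//
// Illustrative worked example (hypothetical entry value): decoding the fields
// above for a 32-bit PTE with the value 0x12345067:
//
//   low bits 0x067 = 0110 0111b -> P=1, R/W=1, U/S=1, PWT=0, PCD=0, A=1, D=1, PAT=0, G=0
//   page frame     = 0x12345067 & 0xfffff000 = 0x12345000
//
// i.e. a present, writable, user-accessible, accessed and dirty 4-KByte page
// located at physical address 0x12345000.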
166 
167 // Combined page directory/page table protection:
168 // ==============================================
169 // There is one column for the combined effect on a 386
170 // and one column for the combined effect on a 486+ CPU.
171 // The 386 CPU behavior is not supported by Bochs.
172 //
173 // +----------------+-----------------+----------------+----------------+
174 // |  Page Directory|     Page Table  |   Combined 386 |  Combined 486+ |
175 // |Privilege  Type | Privilege  Type | Privilege  Type| Privilege  Type|
176 // |----------------+-----------------+----------------+----------------|
177 // |User       R    | User       R    | User       R   | User       R   |
178 // |User       R    | User       RW   | User       R   | User       R   |
179 // |User       RW   | User       R    | User       R   | User       R   |
180 // |User       RW   | User       RW   | User       RW  | User       RW  |
181 // |User       R    | Supervisor R    | User       R   | Supervisor RW  |
182 // |User       R    | Supervisor RW   | User       R   | Supervisor RW  |
183 // |User       RW   | Supervisor R    | User       R   | Supervisor RW  |
184 // |User       RW   | Supervisor RW   | User       RW  | Supervisor RW  |
185 // |Supervisor R    | User       R    | User       R   | Supervisor RW  |
186 // |Supervisor R    | User       RW   | User       R   | Supervisor RW  |
187 // |Supervisor RW   | User       R    | User       R   | Supervisor RW  |
188 // |Supervisor RW   | User       RW   | User       RW  | Supervisor RW  |
189 // |Supervisor R    | Supervisor R    | Supervisor RW  | Supervisor RW  |
190 // |Supervisor R    | Supervisor RW   | Supervisor RW  | Supervisor RW  |
191 // |Supervisor RW   | Supervisor R    | Supervisor RW  | Supervisor RW  |
192 // |Supervisor RW   | Supervisor RW   | Supervisor RW  | Supervisor RW  |
193 // +----------------+-----------------+----------------+----------------+
194 
195 // Page Fault Error Code Format:
196 // =============================
197 //
198 // bits 31..4: Reserved
// bit  3: RSVD (Pentium Pro+)
//   0: fault was not caused by a reserved bit violation
//   1: fault caused by reserved bits set to 1 in a page directory
//      when the PSE or PAE flags in CR4 are set to 1
203 // bit  2: U/S (386+)
//   0: fault originated when in supervisor mode
205 //   1: fault originated when in user mode
206 // bit  1: R/W (386+)
207 //   0: access causing the fault was a read
208 //   1: access causing the fault was a write
209 // bit  0: P (386+)
210 //   0: fault caused by a nonpresent page
211 //   1: fault caused by a page level protection violation
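//
// Worked example (values chosen for illustration): a user-mode write to a
// not-present page produces error code 0x06 (P=0, R/W=1, U/S=1), while a
// user-mode write blocked by a read-only user page produces 0x07 (protection
// violation with P=1).  This matches the error_code assembly in page_fault()
// below.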
212 
213 // Some paging related notes:
214 // ==========================
215 //
216 // - When the processor is running in supervisor level, all pages are both
217 //   readable and writable (write-protect ignored).  When running at user
218 //   level, only pages which belong to the user level are accessible;
219 //   read/write & read-only are readable, read/write are writable.
220 //
221 // - If the Present bit is 0 in either level of page table, an
222 //   access which uses these entries will generate a page fault.
223 //
224 // - (A)ccess bit is used to report read or write access to a page
225 //   or 2nd level page table.
226 //
227 // - (D)irty bit is used to report write access to a page.
228 //
229 // - Processor running at CPL=0,1,2 maps to U/S=0
230 //   Processor running at CPL=3     maps to U/S=1
231 
232 // bit [11] of the TLB lpf used for TLB_NoHostPtr valid indication
233 #define TLB_LPFOf(laddr) AlignedAccessLPFOf(laddr, 0x7ff)
234 
235 #if BX_CPU_LEVEL >= 4
236 #  define BX_PRIV_CHECK_SIZE 32
237 #else
238 #  define BX_PRIV_CHECK_SIZE 16
239 #endif
240 
241 // The 'priv_check' array is used to decide if the current access
242 // has the proper paging permissions.  An index is formed, based
243 // on parameters such as the access type and level, the write protect
244 // flag and values cached in the TLB.  The format of the index into this
245 // array is:
246 //
247 //   |4 |3 |2 |1 |0 |
248 //   |wp|us|us|rw|rw|
249 //    |  |  |  |  |
250 //    |  |  |  |  +---> r/w of current access
251 //    |  |  +--+------> u/s,r/w combined of page dir & table (cached)
252 //    |  +------------> u/s of current access
253 //    +---------------> Current CR0.WP value
254 //
255 //                                                                  CR0.WP = 0     CR0.WP = 1
256 //    -----------------------------------------------------------------------------------------
257 //       0  0  0  0 | sys read from supervisor page             | Allowed       | Allowed
258 //       0  0  0  1 | sys write to read only supervisor page    | Allowed       | Not Allowed
259 //       0  0  1  0 | sys read from supervisor page             | Allowed       | Allowed
260 //       0  0  1  1 | sys write to supervisor page              | Allowed       | Allowed
261 //       0  1  0  0 | sys read from read only user page         | Allowed       | Allowed
262 //       0  1  0  1 | sys write to read only user page          | Allowed       | Not Allowed
263 //       0  1  1  0 | sys read from user page                   | Allowed       | Allowed
264 //       0  1  1  1 | sys write to user page                    | Allowed       | Allowed
265 //       1  0  0  0 | user read from read only supervisor page  | Not Allowed   | Not Allowed
266 //       1  0  0  1 | user write to read only supervisor page   | Not Allowed   | Not Allowed
267 //       1  0  1  0 | user read from supervisor page            | Not Allowed   | Not Allowed
268 //       1  0  1  1 | user write to supervisor page             | Not Allowed   | Not Allowed
269 //       1  1  0  0 | user read from read only user page        | Allowed       | Allowed
270 //       1  1  0  1 | user write to read only user page         | Not Allowed   | Not Allowed
271 //       1  1  1  0 | user read from user page                  | Allowed       | Allowed
272 //       1  1  1  1 | user write to user page                   | Allowed       | Allowed
273 //
274 
275 /* 0xff0bbb0b */
276 static const Bit8u priv_check[BX_PRIV_CHECK_SIZE] =
277 {
278   1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
279 #if BX_CPU_LEVEL >= 4
280   1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1
281 #endif
282 };
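// Worked example (matches the "1  1  0  1" row of the table above): a CPL3
// write to a read-only user page with CR0.WP=1 forms the index
//   (WP<<4) | (us<<3) | (combined u/s << 2) | (combined r/w << 1) | rw
// = (1<<4)  | (1<<3)  | (1<<2)              | (0<<1)              | 1  = 29
// and priv_check[29] == 0, so the page walk raises a protection fault.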
283 
284 // The 'priv_check' array for shadow stack accesses
285 //
286 //      |3 |2 |1 |0 |
287 //      |us|us|rw|rw|
288 //       |  |  |  |
289 //       |  |  |  +---> r/w of current access
290 //       |  +--+------> u/s,r/w combined of page dir & table (cached)
291 //       +------------> u/s of current access
292 //
293 //    -------------------------------------------------------------------
294 //       0  0  0  0 | sys read from supervisor page             | Allowed
295 //       0  0  0  1 | sys write to read only supervisor page    | Allowed : shadow stack page looks like read only page
296 //       0  0  1  0 | sys read from supervisor page             | Allowed
297 //       0  0  1  1 | sys write to supervisor page              | Allowed
298 //       0  1  0  0 | sys read from read only user page         | Not Allowed   : supervisor-mode shadow-stack access is not allowed to a user-mode page
299 //       0  1  0  1 | sys write to read only user page          | Not Allowed   : supervisor-mode shadow-stack access is not allowed to a user-mode page
300 //       0  1  1  0 | sys read from user page                   | Not Allowed   : supervisor-mode shadow-stack access is not allowed to a user-mode page
301 //       0  1  1  1 | sys write to user page                    | Not Allowed   : supervisor-mode shadow-stack access is not allowed to a user-mode page
302 //       1  0  0  0 | user read from read only supervisor page  | Not Allowed   : user-mode shadow-stack access is not allowed to a supervisor-mode page
303 //       1  0  0  1 | user write to read only supervisor page   | Not Allowed   : user-mode shadow-stack access is not allowed to a supervisor-mode page
304 //       1  0  1  0 | user read from supervisor page            | Not Allowed   : user-mode shadow-stack access is not allowed to a supervisor-mode page
305 //       1  0  1  1 | user write to supervisor page             | Not Allowed   : user-mode shadow-stack access is not allowed to a supervisor-mode page
306 //       1  1  0  0 | user read from read only user page        | Allowed
307 //       1  1  0  1 | user write to read only user page         | Allowed : shadow stack page looks like read only page
308 //       1  1  1  0 | user read from user page                  | Allowed
309 //       1  1  1  1 | user write to user page                   | Allowed
310 //
311 
312 const Bit64u BX_PAGING_PHY_ADDRESS_RESERVED_BITS = BX_PHY_ADDRESS_RESERVED_BITS & BX_CONST64(0xfffffffffffff);
313 
314 const Bit64u PAGE_DIRECTORY_NX_BIT = BX_CONST64(0x8000000000000000);
315 
316 const Bit64u BX_CR3_PAGING_MASK = BX_CONST64(0x000ffffffffff000);
317 
318 // Each entry in the TLB cache has 3 entries:
319 //
320 //   lpf:         Linear Page Frame (page aligned linear address of page)
321 //     bits 32..12  Linear page frame
322 //     bit  11      0: TLB HostPtr access allowed, 1: not allowed
323 //     bit  10...0  Invalidate index
324 //
325 //   ppf:         Physical Page Frame (page aligned phy address of page)
326 //
327 //   hostPageAddr:
328 //                Host Page Frame address used for direct access to
329 //                the mem.vector[] space allocated for the guest physical
330 //                memory.  If this is zero, it means that a pointer
331 //                to the host space could not be generated, likely because
332 //                that page of memory is not standard memory (it might
333 //                be memory mapped IO, ROM, etc).
334 //
335 //   accessBits:
336 //
337 //     bit  31:     Page is a global page.
338 //
339 //       The following bits are used for a very efficient permissions
340 //       check.  The goal is to be able, using only the current privilege
341 //       level and access type, to determine if the page tables allow the
//       access to occur, or whether the page tables must be re-walked.  On
//       the first read access, permissions are set to read-only, so a
//       rewalk is necessary when a subsequent write fails the tests.
//       This allows the dirty bit to be set properly while keeping the
//       test efficient.  Note that the CR0.WP flag is not stored here;
//       the values of the following flags are based on the current CR0.WP
//       value, necessitating a TLB flush when CR0.WP changes.
349 //
350 //       The test bit:
351 //         OK = 1 << ((S<<2) | (W<<1) | U)
352 //
353 //       where S:1=Shadow Stack (CET)
354 //             W:1=Write, 0=Read;
355 //             U:1=CPL3, 0=CPL0-2
356 //
357 //       Thus for reads, it is:
358 //         OK = 0x01 << (          U )
359 //       for writes:
360 //         OK = 0x04 << (          U )
361 //       for shadow stack reads:
362 //         OK = 0x10 << (          U )
363 //       for shadow stack writes:
364 //         OK = 0x40 << (          U )
365 //
366 //     bit 3: Write   from User   privilege is OK
367 //     bit 2: Write   from System privilege is OK
368 //     bit 1: Read    from User   privilege is OK
369 //     bit 0: Read    from System privilege is OK
370 //
//       Note that the TLB should have the TLB_NoHostPtr bit set in the lpf when
372 //       direct access through host pointer is NOT allowed for the page.
373 //       A memory operation asking for a direct access through host pointer
374 //       will not set TLB_NoHostPtr bit in its lpf and thus get TLB miss
375 //       result when the direct access is not allowed.
376 //
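//       Worked example (illustrative access only): an ordinary write at CPL3
//       computes OK = 0x04 << 1 = 0x08 and tests bit 3 ("Write from User
//       privilege is OK"); an ordinary read at CPL0 computes OK = 0x01 << 0
//       and tests bit 0.  If the tested accessBits bit is clear, the TLB entry
//       does not grant the access and the page tables are re-walked.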
377 
378 const Bit32u TLB_NoHostPtr = 0x800; /* set this bit when direct access is NOT allowed */
379 
380 #include "cpustats.h"
381 
382 // ==============================================================
383 
void BX_CPU_C::TLB_flush(void)
385 {
386   INC_TLBFLUSH_STAT(tlbGlobalFlushes);
387 
388   invalidate_prefetch_q();
389   invalidate_stack_cache();
390 
391   BX_CPU_THIS_PTR DTLB.flush();
392   BX_CPU_THIS_PTR ITLB.flush();
393 
394 #if BX_SUPPORT_MONITOR_MWAIT
  // invalidating the TLB might change the translation of a monitored page
  // and cause a subsequent MWAIT instruction to wait forever
397   BX_CPU_THIS_PTR monitor.reset_monitor();
398 #endif
399 
  // break all links between traces
401   BX_CPU_THIS_PTR iCache.breakLinks();
402 }
403 
404 #if BX_CPU_LEVEL >= 6
void BX_CPU_C::TLB_flushNonGlobal(void)
406 {
407   INC_TLBFLUSH_STAT(tlbNonGlobalFlushes);
408 
409   invalidate_prefetch_q();
410   invalidate_stack_cache();
411 
412   BX_CPU_THIS_PTR DTLB.flushNonGlobal();
413   BX_CPU_THIS_PTR ITLB.flushNonGlobal();
414 
415 #if BX_SUPPORT_MONITOR_MWAIT
  // invalidating the TLB might change the translation of a monitored page
  // and cause a subsequent MWAIT instruction to wait forever
418   BX_CPU_THIS_PTR monitor.reset_monitor();
419 #endif
420 
  // break all links between traces
422   BX_CPU_THIS_PTR iCache.breakLinks();
423 }
424 #endif
425 
void BX_CPU_C::TLB_invlpg(bx_address laddr)
427 {
428   invalidate_prefetch_q();
429   invalidate_stack_cache();
430 
431   BX_DEBUG(("TLB_invlpg(0x" FMT_ADDRX "): invalidate TLB entry", laddr));
432   BX_CPU_THIS_PTR DTLB.invlpg(laddr);
433   BX_CPU_THIS_PTR ITLB.invlpg(laddr);
434 
435 #if BX_SUPPORT_MONITOR_MWAIT
  // invalidating the TLB entry might change the translation of a monitored
  // page and cause a subsequent MWAIT instruction to wait forever
438   BX_CPU_THIS_PTR monitor.reset_monitor();
439 #endif
440 
  // break all links between traces
442   BX_CPU_THIS_PTR iCache.breakLinks();
443 }
444 
void BX_CPP_AttrRegparmN(1) BX_CPU_C::INVLPG(bxInstruction_c* i)
446 {
447   // CPL is always 0 in real mode
448   if (/* !real_mode() && */ CPL!=0) {
    BX_ERROR(("%s: privilege check failed, generate #GP(0)", i->getIaOpcodeNameShort()));
450     exception(BX_GP_EXCEPTION, 0);
451   }
452 
453   bx_address eaddr = BX_CPU_RESOLVE_ADDR(i);
454   bx_address laddr = get_laddr(i->seg(), eaddr);
455 
456 #if BX_SUPPORT_VMX
457   if (BX_CPU_THIS_PTR in_vmx_guest) {
458     if (VMEXIT(VMX_VM_EXEC_CTRL2_INVLPG_VMEXIT)) VMexit(VMX_VMEXIT_INVLPG, laddr);
459   }
460 #endif
461 
462 #if BX_SUPPORT_SVM
463   if (BX_CPU_THIS_PTR in_svm_guest) {
464     if (SVM_INTERCEPT(SVM_INTERCEPT0_INVLPG))
465       Svm_Vmexit(SVM_VMEXIT_INVLPG, BX_SUPPORT_SVM_EXTENSION(BX_CPUID_SVM_DECODE_ASSIST) ? laddr : 0);
466   }
467 #endif
468 
469 #if BX_SUPPORT_X86_64
470   if (IsCanonical(laddr))
471 #endif
472   {
473     BX_INSTR_TLB_CNTRL(BX_CPU_ID, BX_INSTR_INVLPG, laddr);
474     TLB_invlpg(laddr);
475   }
476 
477   BX_NEXT_TRACE(i);
478 }
479 
480 // error checking order - page not present, reserved bits, protection
481 enum {
482   ERROR_NOT_PRESENT  = 0x00,
483   ERROR_PROTECTION   = 0x01,
484   ERROR_WRITE_ACCESS = 0x02,
485   ERROR_USER_ACCESS  = 0x04,
486   ERROR_RESERVED     = 0x08,
487   ERROR_CODE_ACCESS  = 0x10,
488   ERROR_PKEY         = 0x20,
489   ERROR_SHADOW_STACK = 0x40,
490 };
491 
void BX_CPU_C::page_fault(unsigned fault, bx_address laddr, unsigned user, unsigned rw)
493 {
494   unsigned isWrite = rw & 1;
495 
496   Bit32u error_code = fault | (user << 2) | (isWrite << 1);
497 #if BX_CPU_LEVEL >= 6
498   if (rw == BX_EXECUTE) {
499     if (BX_CPU_THIS_PTR cr4.get_SMEP())
500       error_code |= ERROR_CODE_ACCESS; // I/D = 1
501     if (BX_CPU_THIS_PTR cr4.get_PAE() && BX_CPU_THIS_PTR efer.get_NXE())
502       error_code |= ERROR_CODE_ACCESS;
503   }
504 #endif
505 #if BX_SUPPORT_CET
506   bool is_shadow_stack = (rw & 4) != 0;
507   if (is_shadow_stack)
508     error_code |= ERROR_SHADOW_STACK;
509 #endif
510 
511 #if BX_SUPPORT_SVM
512   SvmInterceptException(BX_HARDWARE_EXCEPTION, BX_PF_EXCEPTION, error_code, 1, laddr); // before the CR2 was modified
513 #endif
514 
515 #if BX_SUPPORT_VMX
516   VMexit_Event(BX_HARDWARE_EXCEPTION, BX_PF_EXCEPTION, error_code, 1, laddr); // before the CR2 was modified
517 #endif
518 
519   BX_CPU_THIS_PTR cr2 = laddr;
520 
521 #if BX_SUPPORT_X86_64
522   BX_DEBUG(("page fault for address %08x%08x @ %08x%08x",
523              GET32H(laddr), GET32L(laddr), GET32H(RIP), GET32L(RIP)));
524 #else
525   BX_DEBUG(("page fault for address %08x @ %08x", laddr, EIP));
526 #endif
527 
528   exception(BX_PF_EXCEPTION, error_code);
529 }
530 
531 enum {
532   BX_LEVEL_PML4 = 3,
533   BX_LEVEL_PDPTE = 2,
534   BX_LEVEL_PDE = 1,
535   BX_LEVEL_PTE = 0
536 };
537 
538 static const char *bx_paging_level[4] = { "PTE", "PDE", "PDPE", "PML4" }; // keep it 4 letters
539 
540 // combined_access legend:
541 // -----------------------
542 // 00    |
543 // 01    | R/W
544 // 02    | U/S
545 // 03    |
546 // 07    | Shadow Stack
547 // 08    | Global
548 // 11-09 | memtype (3 bits)
549 
550 enum {
551   BX_COMBINED_ACCESS_WRITE = 0x2,
552   BX_COMBINED_ACCESS_USER  = 0x4,
553   BX_COMBINED_SHADOW_STACK = 0x80,
554   BX_COMBINED_GLOBAL_PAGE  = 0x100,
555 };
556 
557 #if BX_CPU_LEVEL >= 6
558 
559 //                Format of a Long Mode Non-Leaf Entry
560 // -----------------------------------------------------------
561 // 00    | Present (P)
562 // 01    | R/W
563 // 02    | U/S
564 // 03    | Page-Level Write-Through (PWT)
565 // 04    | Page-Level Cache-Disable (PCD)
566 // 05    | Accessed (A)
567 // 06    | (ignored)
568 // 07    | Page Size (PS), must be 0 if no Large Page on the level
569 // 11-08 | (ignored)
570 // PA-12 | Physical address of 4-KByte aligned page-directory-pointer table
571 // 51-PA | Reserved (must be zero)
572 // 62-52 | (ignored)
573 // 63    | Execute-Disable (XD) (if EFER.NXE=1, reserved otherwise)
574 // -----------------------------------------------------------
575 
576 const Bit64u PAGING_PAE_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS;
577 
// in legacy PAE mode bits [62:52] are reserved. bit 63 is the NX bit
579 const Bit64u PAGING_LEGACY_PAE_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS | BX_CONST64(0x7ff0000000000000);
580 
581 //       Format of a PDPTE that References a 1-GByte Page
582 // -----------------------------------------------------------
583 // 00    | Present (P)
584 // 01    | R/W
585 // 02    | U/S
586 // 03    | Page-Level Write-Through (PWT)
587 // 04    | Page-Level Cache-Disable (PCD)
588 // 05    | Accessed (A)
589 // 06    | (ignored)
590 // 07    | Page Size, must be 1 to indicate a 1-GByte Page
591 // 08    | Global (G) (if CR4.PGE=1, ignored otherwise)
592 // 11-09 | (ignored)
593 // 12    | PAT (if PAT is supported, reserved otherwise)
594 // 29-13 | Reserved (must be zero)
595 // PA-30 | Physical address of the 1-Gbyte Page
596 // 51-PA | Reserved (must be zero)
597 // 62-52 | (ignored)
598 // 63    | Execute-Disable (XD) (if EFER.NXE=1, reserved otherwise)
599 // -----------------------------------------------------------
600 
601 const Bit64u PAGING_PAE_PDPTE1G_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS | BX_CONST64(0x3FFFE000);
602 
603 //        Format of a PAE PDE that Maps a 2-MByte Page
604 // -----------------------------------------------------------
605 // 00    | Present (P)
606 // 01    | R/W
607 // 02    | U/S
608 // 03    | Page-Level Write-Through (PWT)
609 // 04    | Page-Level Cache-Disable (PCD)
610 // 05    | Accessed (A)
611 // 06    | Dirty (D)
612 // 07    | Page Size (PS), must be 1 to indicate a 2-MByte Page
613 // 08    | Global (G) (if CR4.PGE=1, ignored otherwise)
614 // 11-09 | (ignored)
615 // 12    | PAT (if PAT is supported, reserved otherwise)
616 // 20-13 | Reserved (must be zero)
617 // PA-21 | Physical address of the 2-MByte page
618 // 51-PA | Reserved (must be zero)
619 // 62-52 | ignored in long mode, reserved (must be 0) in legacy PAE mode
620 // 63    | Execute-Disable (XD) (if EFER.NXE=1, reserved otherwise)
621 // -----------------------------------------------------------
622 
623 const Bit64u PAGING_PAE_PDE2M_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS | BX_CONST64(0x001FE000);
624 
625 //        Format of a PAE PTE that Maps a 4-KByte Page
626 // -----------------------------------------------------------
627 // 00    | Present (P)
628 // 01    | R/W
629 // 02    | U/S
630 // 03    | Page-Level Write-Through (PWT)
631 // 04    | Page-Level Cache-Disable (PCD)
632 // 05    | Accessed (A)
633 // 06    | Dirty (D)
634 // 07    | PAT (if PAT is supported, reserved otherwise)
635 // 08    | Global (G) (if CR4.PGE=1, ignored otherwise)
636 // 11-09 | (ignored)
637 // PA-12 | Physical address of the 4-KByte page
638 // 51-PA | Reserved (must be zero)
639 // 62-52 | ignored in long mode, reserved (must be 0) in legacy PAE mode
640 // 63    | Execute-Disable (XD) (if EFER.NXE=1, reserved otherwise)
641 // -----------------------------------------------------------
642 
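// check one PAE/long-mode paging-structure entry: return -1 if the entry passes
// the Present and reserved-bit checks (flagging *nx_fault for an instruction
// fetch from a no-execute page), otherwise return an ERROR_* combination that
// the caller passes on to page_fault()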
int BX_CPU_C::check_entry_PAE(const char *s, Bit64u entry, Bit64u reserved, unsigned rw, bool *nx_fault)
644 {
645   if (!(entry & 0x1)) {
646     BX_DEBUG(("PAE %s: entry not present", s));
647     return ERROR_NOT_PRESENT;
648   }
649 
650   if (entry & reserved) {
651     BX_DEBUG(("PAE %s: reserved bit is set 0x" FMT_ADDRX64, s, entry));
652     return ERROR_RESERVED | ERROR_PROTECTION;
653   }
654 
655   if (entry & PAGE_DIRECTORY_NX_BIT) {
656     if (rw == BX_EXECUTE) {
657       BX_DEBUG(("PAE %s: non-executable page fault occurred", s));
658       *nx_fault = true;
659     }
660   }
661 
662   return -1;
663 }
664 
665 #if BX_SUPPORT_MEMTYPE
BX_CPP_INLINE Bit32u calculate_pcd_pwt(Bit32u entry)
667 {
668   Bit32u pcd_pwt = (entry >> 3) & 0x3; // PCD, PWT are stored in bits 3 and 4
669   return pcd_pwt;
670 }
671 
// extract the PCD, PWT and PAT bits from a page table entry
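// the result is a 3-bit index into the PAT MSR: (PAT << 2) | (PCD << 1) | PWT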
BX_CPP_INLINE Bit32u calculate_pat(Bit32u entry, Bit32u lpf_mask)
674 {
675   Bit32u pcd_pwt = calculate_pcd_pwt(entry);
676   // PAT is stored in bit 12 for large pages and in bit 7 for small pages
677   Bit32u pat = ((lpf_mask < 0x1000) ? (entry >> 7) : (entry >> 12)) & 0x1;
678   return pcd_pwt | (pat << 2);
679 }
680 #endif
681 
682 #if BX_SUPPORT_X86_64
683 
684 // Translate a linear address to a physical address in long mode
bx_phy_address BX_CPU_C::translate_linear_long_mode(bx_address laddr, Bit32u &lpf_mask, Bit32u &pkey, unsigned user, unsigned rw)
686 {
687   bx_phy_address ppf = BX_CPU_THIS_PTR cr3 & BX_CR3_PAGING_MASK;
688 
689   bx_phy_address entry_addr[4];
690   Bit64u entry[4];
691   BxMemtype entry_memtype[4] = { 0 };
692 
693   bool nx_fault = false;
694   int leaf;
695 
696   Bit64u offset_mask = BX_CONST64(0x0000ffffffffffff);
697   lpf_mask = 0xfff;
698   Bit32u combined_access = (BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER);
699   Bit64u curr_entry = BX_CPU_THIS_PTR cr3;
700 
701   Bit64u reserved = PAGING_PAE_RESERVED_BITS;
702   if (! BX_CPU_THIS_PTR efer.get_NXE())
703     reserved |= PAGE_DIRECTORY_NX_BIT;
704 
705   for (leaf = BX_LEVEL_PML4;; --leaf) {
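    // each level consumes 9 bits of the linear address (bits 47..39 for the PML4,
    // 38..30 for the PDPT, 29..21 for the PD, 20..12 for the PT); the 0xff8 mask
    // below selects the 8-byte entry offset within the 4K-aligned table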
706     entry_addr[leaf] = ppf + ((laddr >> (9 + 9*leaf)) & 0xff8);
707 #if BX_SUPPORT_VMX >= 2
708     if (BX_CPU_THIS_PTR in_vmx_guest) {
709       if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE))
710         entry_addr[leaf] = translate_guest_physical(entry_addr[leaf], laddr, 1, 1, BX_READ);
711     }
712 #endif
713 #if BX_SUPPORT_SVM
714     if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED) {
715       entry_addr[leaf] = nested_walk(entry_addr[leaf], BX_RW, 1);
716     }
717 #endif
718 
719 #if BX_SUPPORT_MEMTYPE
720     entry_memtype[leaf] = resolve_memtype(memtype_by_mtrr(entry_addr[leaf]), memtype_by_pat(calculate_pcd_pwt((Bit32u) curr_entry)));
721 #endif
722     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
723     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, entry_memtype[leaf], BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
724 
725     offset_mask >>= 9;
726 
727     curr_entry = entry[leaf];
728     int fault = check_entry_PAE(bx_paging_level[leaf], curr_entry, reserved, rw, &nx_fault);
729     if (fault >= 0)
730       page_fault(fault, laddr, user, rw);
731 
732     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
733 
734     if (leaf == BX_LEVEL_PTE) break;
735 
736     if (curr_entry & 0x80) {
737       if (leaf > (BX_LEVEL_PDE + !!is_cpu_extension_supported(BX_ISA_1G_PAGES))) {
738         BX_DEBUG(("long mode %s: PS bit set !", bx_paging_level[leaf]));
739         page_fault(ERROR_RESERVED | ERROR_PROTECTION, laddr, user, rw);
740       }
741 
742       ppf &= BX_CONST64(0x000fffffffffe000);
743       if (ppf & offset_mask) {
744          BX_DEBUG(("long mode %s: reserved bit is set: 0x" FMT_ADDRX64, bx_paging_level[leaf], curr_entry));
745          page_fault(ERROR_RESERVED | ERROR_PROTECTION, laddr, user, rw);
746       }
747 
748       lpf_mask = (Bit32u) offset_mask;
749       break;
750     }
751 
752     combined_access &= curr_entry; // U/S and R/W
753   }
754 
755   bool isWrite = (rw & 1); // write or r-m-w
756 
757 #if BX_SUPPORT_PKEYS
758   if (rw != BX_EXECUTE) {
759     if (BX_CPU_THIS_PTR cr4.get_PKE()) {
760       pkey = (entry[leaf] >> 59) & 0xf;
761 
      // check whether the accessDisable bit is set
763       if (user) {
764         if (BX_CPU_THIS_PTR pkru & (1<<(pkey*2))) {
765           BX_ERROR(("protection key access not allowed PKRU=%x pkey=%d", BX_CPU_THIS_PTR pkru, pkey));
766           page_fault(ERROR_PROTECTION | ERROR_PKEY, laddr, user, rw);
767         }
768       }
769 
      // check whether the writeDisable bit is set
771       if (BX_CPU_THIS_PTR pkru & (1<<(pkey*2+1))) {
772         if (isWrite && (user || BX_CPU_THIS_PTR cr0.get_WP())) {
773           BX_ERROR(("protection key write not allowed PKRU=%x pkey=%d", BX_CPU_THIS_PTR pkru, pkey));
774           page_fault(ERROR_PROTECTION | ERROR_PKEY, laddr, user, rw);
775         }
776       }
777     }
778 
779     if (BX_CPU_THIS_PTR cr4.get_PKS() && !user) {
780       pkey = (entry[leaf] >> 59) & 0xf;
781 
      // check whether the accessDisable bit is set
783       if (BX_CPU_THIS_PTR pkrs & (1<<(pkey*2))) {
784         BX_ERROR(("protection key access not allowed PKRS=%x pkey=%d", BX_CPU_THIS_PTR pkrs, pkey));
785         page_fault(ERROR_PROTECTION | ERROR_PKEY, laddr, user, rw);
786       }
787 
      // check whether the writeDisable bit is set
789       if (BX_CPU_THIS_PTR pkrs & (1<<(pkey*2+1))) {
790         if (isWrite && BX_CPU_THIS_PTR cr0.get_WP()) {
791           BX_ERROR(("protection key write not allowed PKRS=%x pkey=%d", BX_CPU_THIS_PTR pkrs, pkey));
792           page_fault(ERROR_PROTECTION | ERROR_PKEY, laddr, user, rw);
793         }
794       }
795     }
796   }
797 #endif
798 
799 #if BX_SUPPORT_CET
800   bool shadow_stack = (rw & 4) != 0;
801   if (shadow_stack) {
802     // shadow stack pages:
803     //  - R/W bit=1 in every paging structure entry except the leaf
804     //  - R/W bit=0 and Dirty=1 for leaf entry
805     bool shadow_stack_page = ((combined_access & BX_COMBINED_ACCESS_WRITE) != 0) && ((entry[leaf] & 0x40) != 0) && ((entry[leaf] & 0x02) == 0);
806     if (!shadow_stack_page) {
      BX_DEBUG(("shadow stack access to a non shadow stack page CA=%x entry=%x", combined_access, Bit32u(entry[leaf] & 0xfff)));
808       page_fault(ERROR_PROTECTION, laddr, user, rw);
809     }
810 
811     combined_access &= entry[leaf]; // U/S and R/W
812 
813     // must be to shadow stack page, check that U/S match
814     if ((combined_access & BX_COMBINED_ACCESS_USER) ^ (user << 2)) {
815       BX_DEBUG(("shadow stack U/S access mismatch"));
816       page_fault(ERROR_PROTECTION, laddr, user, rw);
817     }
818     combined_access |= BX_COMBINED_SHADOW_STACK;
819   }
820   else
821 #endif
822   {
823     combined_access &= entry[leaf]; // U/S and R/W
824 
825     unsigned priv_index = (BX_CPU_THIS_PTR cr0.get_WP() << 4) | // bit 4
826                           (user<<3) |                           // bit 3
827                           (combined_access | (unsigned)isWrite);// bit 2,1,0
828 
829     if (!priv_check[priv_index] || nx_fault)
830       page_fault(ERROR_PROTECTION, laddr, user, rw);
831   }
832 
833   if (BX_CPU_THIS_PTR cr4.get_SMEP() && rw == BX_EXECUTE && !user) {
834     if (combined_access & BX_COMBINED_ACCESS_USER)
835       page_fault(ERROR_PROTECTION, laddr, user, rw);
836   }
837 
838   // SMAP protections are disabled if EFLAGS.AC=1
839   if (BX_CPU_THIS_PTR cr4.get_SMAP() && ! BX_CPU_THIS_PTR get_AC() && rw != BX_EXECUTE && ! user) {
840     if (combined_access & BX_COMBINED_ACCESS_USER)
841       page_fault(ERROR_PROTECTION, laddr, user, rw);
842   }
843 
844   if (BX_CPU_THIS_PTR cr4.get_PGE())
845     combined_access |= (entry[leaf] & BX_COMBINED_GLOBAL_PAGE);
846 
847 #if BX_SUPPORT_MEMTYPE
848   combined_access |= (memtype_by_pat(calculate_pat((Bit32u) entry[leaf], lpf_mask)) << 9);
849 #endif
850 
851   // Update A/D bits if needed
852   update_access_dirty_PAE(entry_addr, entry, entry_memtype, BX_LEVEL_PML4, leaf, isWrite);
853 
854   return (ppf | combined_access);
855 }
856 
857 #endif
858 
void BX_CPU_C::update_access_dirty_PAE(bx_phy_address *entry_addr, Bit64u *entry, BxMemtype *entry_memtype, unsigned max_level, unsigned leaf, unsigned write)
860 {
861   // Update A bit if needed
862   for (unsigned level=max_level; level > leaf; level--) {
863     if (!(entry[level] & 0x20)) {
864       entry[level] |= 0x20;
865       access_write_physical(entry_addr[level], 8, &entry[level]);
866       BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[level], 8, entry_memtype[level], BX_WRITE,
867             (BX_PTE_ACCESS + level), (Bit8u*)(&entry[level]));
868     }
869   }
870 
871   // Update A/D bits if needed
872   if (!(entry[leaf] & 0x20) || (write && !(entry[leaf] & 0x40))) {
873     entry[leaf] |= (0x20 | (write<<6)); // Update A and possibly D bits
874     access_write_physical(entry_addr[leaf], 8, &entry[leaf]);
875     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, entry_memtype[leaf], BX_WRITE,
876             (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
877   }
878 }
879 
880 //          Format of Legacy PAE PDPTR entry (PDPTE)
881 // -----------------------------------------------------------
882 // 00    | Present (P)
883 // 02-01 | Reserved (must be zero)
884 // 03    | Page-Level Write-Through (PWT) (486+), 0=reserved otherwise
885 // 04    | Page-Level Cache-Disable (PCD) (486+), 0=reserved otherwise
886 // 08-05 | Reserved (must be zero)
887 // 11-09 | (ignored)
888 // PA-12 | Physical address of 4-KByte aligned page directory
889 // 63-PA | Reserved (must be zero)
890 // -----------------------------------------------------------
891 
892 const Bit64u PAGING_PAE_PDPTE_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS | BX_CONST64(0xFFF00000000001E6);
893 
bool BX_CPP_AttrRegparmN(1) BX_CPU_C::CheckPDPTR(bx_phy_address cr3_val)
895 {
896   // with Nested Paging PDPTRs are not loaded for guest page tables but
897   // accessed on demand as part of the guest page walk
898 #if BX_SUPPORT_SVM
899   if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED)
900     return 1;
901 #endif
902 
903   cr3_val &= 0xffffffe0;
904 #if BX_SUPPORT_VMX >= 2
905   if (BX_CPU_THIS_PTR in_vmx_guest) {
906     if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE))
907       cr3_val = translate_guest_physical(cr3_val, 0, 0, 1, BX_READ);
908   }
909 #endif
910 
911   Bit64u pdptr[4];
912   unsigned n;
913 
914   for (n=0; n<4; n++) {
915     // read and check PDPTE entries
916     bx_phy_address pdpe_entry_addr = (bx_phy_address) (cr3_val | (n << 3));
917     access_read_physical(pdpe_entry_addr, 8, &(pdptr[n]));
918     BX_NOTIFY_PHY_MEMORY_ACCESS(pdpe_entry_addr, 8, BX_MEMTYPE_INVALID, BX_READ, (BX_PDPTR0_ACCESS + n), (Bit8u*) &(pdptr[n]));
919 
920     if (pdptr[n] & 0x1) {
921        if (pdptr[n] & PAGING_PAE_PDPTE_RESERVED_BITS) return 0;
922     }
923   }
924 
925   // load new PDPTRs
926   for (n=0; n<4; n++)
927     BX_CPU_THIS_PTR PDPTR_CACHE.entry[n] = pdptr[n];
928 
929   return 1; /* PDPTRs are fine */
930 }
931 
932 #if BX_SUPPORT_VMX >= 2
bool BX_CPP_AttrRegparmN(1) BX_CPU_C::CheckPDPTR(Bit64u *pdptr)
934 {
935   for (unsigned n=0; n<4; n++) {
936      if (pdptr[n] & 0x1) {
937         if (pdptr[n] & PAGING_PAE_PDPTE_RESERVED_BITS) return 0;
938      }
939   }
940 
941   return 1; /* PDPTRs are fine */
942 }
943 #endif
944 
bx_phy_address BX_CPU_C::translate_linear_load_PDPTR(bx_address laddr, unsigned user, unsigned rw)
946 {
947   unsigned index = (laddr >> 30) & 0x3;
948   Bit64u pdptr;
949 
950 #if BX_SUPPORT_SVM
951   if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED)
952   {
953     bx_phy_address cr3_val = BX_CPU_THIS_PTR cr3 & 0xffffffe0;
954     cr3_val = nested_walk(cr3_val, BX_RW, 1);
955 
956     bx_phy_address pdpe_entry_addr = (bx_phy_address) (cr3_val | (index << 3));
957     access_read_physical(pdpe_entry_addr, 8, &pdptr);
958     BX_NOTIFY_PHY_MEMORY_ACCESS(pdpe_entry_addr, 8, BX_MEMTYPE_INVALID, BX_READ, (BX_PDPTR0_ACCESS + index), (Bit8u*) &pdptr);
959 
960     if (pdptr & 0x1) {
961       if (pdptr & PAGING_PAE_PDPTE_RESERVED_BITS) {
962         BX_DEBUG(("PAE PDPTE%d entry reserved bits set: 0x" FMT_ADDRX64, index, pdptr));
963         page_fault(ERROR_RESERVED | ERROR_PROTECTION, laddr, user, rw);
964       }
965     }
966   }
967   else
968 #endif
969   {
970     pdptr = BX_CPU_THIS_PTR PDPTR_CACHE.entry[index];
971   }
972 
973   if (! (pdptr & 0x1)) {
974     BX_DEBUG(("PAE PDPTE entry not present !"));
975     page_fault(ERROR_NOT_PRESENT, laddr, user, rw);
976   }
977 
978   return pdptr;
979 }
980 
981 // Translate a linear address to a physical address in PAE paging mode
bx_phy_address BX_CPU_C::translate_linear_PAE(bx_address laddr, Bit32u &lpf_mask, unsigned user, unsigned rw)
983 {
984   bx_phy_address entry_addr[2];
985   Bit64u entry[2];
986   BxMemtype entry_memtype[2] = { 0 };
987   bool nx_fault = false;
988   int leaf;
989 
990   lpf_mask = 0xfff;
991   Bit32u combined_access = (BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER);
992 
993   Bit64u reserved = PAGING_LEGACY_PAE_RESERVED_BITS;
994   if (! BX_CPU_THIS_PTR efer.get_NXE())
995     reserved |= PAGE_DIRECTORY_NX_BIT;
996 
997   Bit64u pdpte = translate_linear_load_PDPTR(laddr, user, rw);
998   bx_phy_address ppf = pdpte & BX_CONST64(0x000ffffffffff000);
999   Bit64u curr_entry = pdpte;
1000 
1001   for (leaf = BX_LEVEL_PDE;; --leaf) {
1002     entry_addr[leaf] = ppf + ((laddr >> (9 + 9*leaf)) & 0xff8);
1003 #if BX_SUPPORT_VMX >= 2
1004     if (BX_CPU_THIS_PTR in_vmx_guest) {
1005       if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE))
1006         entry_addr[leaf] = translate_guest_physical(entry_addr[leaf], laddr, 1, 1, BX_READ);
1007     }
1008 #endif
1009 #if BX_SUPPORT_SVM
1010     if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED) {
1011       entry_addr[leaf] = nested_walk(entry_addr[leaf], BX_RW, 1);
1012     }
1013 #endif
1014 
1015 #if BX_SUPPORT_MEMTYPE
1016     entry_memtype[leaf] = resolve_memtype(memtype_by_mtrr(entry_addr[leaf]), memtype_by_pat(calculate_pcd_pwt((Bit32u) curr_entry)));
1017 #endif
1018     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
1019     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, entry_memtype[leaf], BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1020 
1021     curr_entry = entry[leaf];
1022     int fault = check_entry_PAE(bx_paging_level[leaf], curr_entry, reserved, rw, &nx_fault);
1023     if (fault >= 0)
1024       page_fault(fault, laddr, user, rw);
1025 
1026     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
1027 
1028     if (leaf == BX_LEVEL_PTE) break;
1029 
1030     // Ignore CR4.PSE in PAE mode
1031     if (curr_entry & 0x80) {
1032       if (curr_entry & PAGING_PAE_PDE2M_RESERVED_BITS) {
1033         BX_DEBUG(("PAE PDE2M: reserved bit is set PDE=0x" FMT_ADDRX64, curr_entry));
1034         page_fault(ERROR_RESERVED | ERROR_PROTECTION, laddr, user, rw);
1035       }
1036 
1037       // Make up the physical page frame address
1038       ppf = (bx_phy_address)(curr_entry & BX_CONST64(0x000fffffffe00000));
1039       lpf_mask = 0x1fffff;
1040       break;
1041     }
1042 
1043     combined_access &= curr_entry; // U/S and R/W
1044   }
1045 
1046   bool isWrite = (rw & 1); // write or r-m-w
1047 
1048 #if BX_SUPPORT_CET
1049   bool shadow_stack = (rw & 4) != 0;
1050   if (shadow_stack) {
1051     // shadow stack pages:
1052     //  - R/W bit=1 in every paging structure entry except the leaf
1053     //  - R/W bit=0 and Dirty=1 for leaf entry
1054     bool shadow_stack_page = ((combined_access & BX_COMBINED_ACCESS_WRITE) != 0) && ((entry[leaf] & 0x40) != 0) && ((entry[leaf] & 0x02) == 0);
1055     if (!shadow_stack_page)
1056       page_fault(ERROR_PROTECTION, laddr, user, rw);
1057 
1058     combined_access &= entry[leaf]; // U/S and R/W
1059 
1060     // must be to shadow stack page, check that U/S match
1061     if ((combined_access & BX_COMBINED_ACCESS_USER) ^ (user << 2)) {
1062       BX_DEBUG(("shadow stack U/S access mismatch"));
1063       page_fault(ERROR_PROTECTION, laddr, user, rw);
1064     }
1065     combined_access |= BX_COMBINED_SHADOW_STACK;
1066   }
1067   else
1068 #endif
1069   {
1070     combined_access &= entry[leaf]; // U/S and R/W
1071 
1072     unsigned priv_index = (BX_CPU_THIS_PTR cr0.get_WP() << 4) | // bit 4
1073                           (user<<3) |                           // bit 3
1074                           (combined_access | (unsigned)isWrite);// bit 2,1,0
1075 
1076     if (!priv_check[priv_index] || nx_fault)
1077       page_fault(ERROR_PROTECTION, laddr, user, rw);
1078   }
1079 
1080   if (BX_CPU_THIS_PTR cr4.get_SMEP() && rw == BX_EXECUTE && !user) {
1081     if (combined_access & BX_COMBINED_ACCESS_USER)
1082       page_fault(ERROR_PROTECTION, laddr, user, rw);
1083   }
1084 
1085   // SMAP protections are disabled if EFLAGS.AC=1
1086   if (BX_CPU_THIS_PTR cr4.get_SMAP() && ! BX_CPU_THIS_PTR get_AC() && rw != BX_EXECUTE && ! user) {
1087     if (combined_access & BX_COMBINED_ACCESS_USER)
1088       page_fault(ERROR_PROTECTION, laddr, user, rw);
1089   }
1090 
1091   if (BX_CPU_THIS_PTR cr4.get_PGE())
1092     combined_access |= (entry[leaf] & BX_COMBINED_GLOBAL_PAGE); // G
1093 
1094 #if BX_SUPPORT_MEMTYPE
1095   combined_access |= (memtype_by_pat(calculate_pat((Bit32u) entry[leaf], lpf_mask)) << 9);
1096 #endif
1097 
1098   // Update A/D bits if needed
1099   update_access_dirty_PAE(entry_addr, entry, entry_memtype, BX_LEVEL_PDE, leaf, isWrite);
1100 
1101   return (ppf | combined_access);
1102 }
1103 
1104 #endif
1105 
1106 //           Format of a PDE that Maps a 4-MByte Page
1107 // -----------------------------------------------------------
1108 // 00    | Present (P)
1109 // 01    | R/W
1110 // 02    | U/S
1111 // 03    | Page-Level Write-Through (PWT)
1112 // 04    | Page-Level Cache-Disable (PCD)
1113 // 05    | Accessed (A)
1114 // 06    | Dirty (D)
1115 // 07    | Page size, must be 1 to indicate 4-Mbyte page
1116 // 08    | Global (G) (if CR4.PGE=1, ignored otherwise)
1117 // 11-09 | (ignored)
1118 // 12    | PAT (if PAT is supported, reserved otherwise)
1119 // PA-13 | Bits PA-32 of physical address of the 4-MByte page
1120 // 21-PA | Reserved (must be zero)
1121 // 31-22 | Bits 31-22 of physical address of the 4-MByte page
1122 // -----------------------------------------------------------
1123 
1124 #if BX_PHY_ADDRESS_WIDTH > 40
1125 const Bit32u PAGING_PDE4M_RESERVED_BITS = 0; // there are no reserved bits in PDE4M when physical address is wider than 40 bit
1126 #else
1127 const Bit32u PAGING_PDE4M_RESERVED_BITS = ((1 << (41-BX_PHY_ADDRESS_WIDTH))-1) << (13 + BX_PHY_ADDRESS_WIDTH - 32);
1128 #endif
1129 
1130 // Translate a linear address to a physical address in legacy paging mode
bx_phy_address BX_CPU_C::translate_linear_legacy(bx_address laddr, Bit32u &lpf_mask, unsigned user, unsigned rw)
1132 {
1133   bx_phy_address entry_addr[2], ppf = (Bit32u) BX_CPU_THIS_PTR cr3 & BX_CR3_PAGING_MASK;
1134   Bit32u entry[2];
1135   BxMemtype entry_memtype[2] = { 0 };
1136   int leaf;
1137 
1138   lpf_mask = 0xfff;
1139   Bit32u combined_access = (BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER);
1140   Bit32u curr_entry = (Bit32u) BX_CPU_THIS_PTR cr3;
1141 
1142   for (leaf = BX_LEVEL_PDE;; --leaf) {
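    // legacy 32-bit paging consumes 10 bits of the linear address per level
    // (bits 31..22 for the PDE, 21..12 for the PTE) with 4-byte entries,
    // hence the 0xffc mask below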
1143     entry_addr[leaf] = ppf + ((laddr >> (10 + 10*leaf)) & 0xffc);
1144 #if BX_SUPPORT_VMX >= 2
1145     if (BX_CPU_THIS_PTR in_vmx_guest) {
1146       if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE))
1147         entry_addr[leaf] = translate_guest_physical(entry_addr[leaf], laddr, 1, 1, BX_READ);
1148     }
1149 #endif
1150 #if BX_SUPPORT_SVM
1151     if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED) {
1152       entry_addr[leaf] = nested_walk(entry_addr[leaf], BX_RW, 1);
1153     }
1154 #endif
1155 
1156 #if BX_SUPPORT_MEMTYPE
1157     entry_memtype[leaf] = resolve_memtype(memtype_by_mtrr(entry_addr[leaf]), memtype_by_pat(calculate_pcd_pwt(curr_entry)));
1158 #endif
1159     access_read_physical(entry_addr[leaf], 4, &entry[leaf]);
1160     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 4, entry_memtype[leaf], BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1161 
1162     curr_entry = entry[leaf];
1163     if (!(curr_entry & 0x1)) {
1164       BX_DEBUG(("%s: entry not present", bx_paging_level[leaf]));
1165       page_fault(ERROR_NOT_PRESENT, laddr, user, rw);
1166     }
1167 
1168     ppf = curr_entry & 0xfffff000;
1169 
1170     if (leaf == BX_LEVEL_PTE) break;
1171 
1172 #if BX_CPU_LEVEL >= 5
1173     if ((curr_entry & 0x80) != 0 && BX_CPU_THIS_PTR cr4.get_PSE()) {
1174       // 4M paging, only if CR4.PSE enabled, ignore PDE.PS otherwise
1175       if (curr_entry & PAGING_PDE4M_RESERVED_BITS) {
1176         BX_DEBUG(("PSE PDE4M: reserved bit is set: PDE=0x%08x", entry[BX_LEVEL_PDE]));
1177         page_fault(ERROR_RESERVED | ERROR_PROTECTION, laddr, user, rw);
1178       }
1179 
1180       // make up the physical frame number
1181       ppf = (curr_entry & 0xffc00000);
1182 #if BX_PHY_ADDRESS_WIDTH > 32
1183       ppf |= ((bx_phy_address)(curr_entry & 0x003fe000)) << 19;
1184 #endif
1185       lpf_mask = 0x3fffff;
1186       break;
1187     }
1188 #endif
1189 
1190     combined_access &= curr_entry; // U/S and R/W
1191   }
1192 
1193   bool isWrite = (rw & 1); // write or r-m-w
1194 
1195 #if BX_SUPPORT_CET
1196   bool shadow_stack = (rw & 4) != 0;
1197   if (shadow_stack) {
1198     // shadow stack pages:
1199     //  - R/W bit=1 in every paging structure entry except the leaf
1200     //  - R/W bit=0 and Dirty=1 for leaf entry
1201     bool shadow_stack_page = ((combined_access & BX_COMBINED_ACCESS_WRITE) != 0) && ((entry[leaf] & 0x40) != 0) && ((entry[leaf] & 0x02) == 0);
1202     if (!shadow_stack_page)
1203       page_fault(ERROR_PROTECTION, laddr, user, rw);
1204 
1205     combined_access &= entry[leaf]; // U/S and R/W
1206 
1207     // must be to shadow stack page, check that U/S match
1208     if ((combined_access & BX_COMBINED_ACCESS_USER) ^ (user << 2)) {
1209       BX_DEBUG(("shadow stack U/S access mismatch"));
1210       page_fault(ERROR_PROTECTION, laddr, user, rw);
1211     }
1212     combined_access |= BX_COMBINED_SHADOW_STACK;
1213   }
1214   else
1215 #endif
1216   {
1217     combined_access &= entry[leaf]; // U/S and R/W
1218 
1219     unsigned priv_index =
1220 #if BX_CPU_LEVEL >= 4
1221         (BX_CPU_THIS_PTR cr0.get_WP() << 4) |   // bit 4
1222 #endif
1223         (user<<3) |                             // bit 3
1224         (combined_access | (unsigned)isWrite);  // bit 2,1,0
1225 
1226     if (!priv_check[priv_index])
1227       page_fault(ERROR_PROTECTION, laddr, user, rw);
1228   }
1229 
1230 #if BX_CPU_LEVEL >= 6
1231   if (BX_CPU_THIS_PTR cr4.get_SMEP() && rw == BX_EXECUTE && !user) {
1232     if (combined_access & BX_COMBINED_ACCESS_USER)
1233       page_fault(ERROR_PROTECTION, laddr, user, rw);
1234   }
1235 
1236   // SMAP protections are disabled if EFLAGS.AC=1
1237   if (BX_CPU_THIS_PTR cr4.get_SMAP() && ! BX_CPU_THIS_PTR get_AC() && rw != BX_EXECUTE && ! user) {
1238     if (combined_access & BX_COMBINED_ACCESS_USER)
1239       page_fault(ERROR_PROTECTION, laddr, user, rw);
1240   }
1241 
1242   if (BX_CPU_THIS_PTR cr4.get_PGE())
1243     combined_access |= (entry[leaf] & BX_COMBINED_GLOBAL_PAGE);
1244 
1245 #if BX_SUPPORT_MEMTYPE
1246   combined_access |= (memtype_by_pat(calculate_pat(entry[leaf], lpf_mask)) << 9);
1247 #endif
1248 
1249 #endif
1250 
1251   update_access_dirty(entry_addr, entry, entry_memtype, leaf, isWrite);
1252 
1253   return (ppf | combined_access);
1254 }
1255 
void BX_CPU_C::update_access_dirty(bx_phy_address *entry_addr, Bit32u *entry, BxMemtype *entry_memtype, unsigned leaf, unsigned write)
1257 {
1258   if (leaf == BX_LEVEL_PTE) {
1259     // Update PDE A bit if needed
1260     if (!(entry[BX_LEVEL_PDE] & 0x20)) {
1261       entry[BX_LEVEL_PDE] |= 0x20;
1262       access_write_physical(entry_addr[BX_LEVEL_PDE], 4, &entry[BX_LEVEL_PDE]);
1263       BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[BX_LEVEL_PDE], 4, entry_memtype[BX_LEVEL_PDE], BX_WRITE, BX_PDE_ACCESS, (Bit8u*)(&entry[BX_LEVEL_PDE]));
1264     }
1265   }
1266 
1267   // Update A/D bits if needed
1268   if (!(entry[leaf] & 0x20) || (write && !(entry[leaf] & 0x40))) {
1269     entry[leaf] |= (0x20 | (write<<6)); // Update A and possibly D bits
1270     access_write_physical(entry_addr[leaf], 4, &entry[leaf]);
1271     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 4, entry_memtype[leaf], BX_WRITE, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1272   }
1273 }
1274 
1275 // Translate a linear address to a physical address
bx_phy_address BX_CPU_C::translate_linear(bx_TLB_entry *tlbEntry, bx_address laddr, unsigned user, unsigned rw)
1277 {
1278 #if BX_SUPPORT_X86_64
1279   if (! long_mode()) laddr &= 0xffffffff;
1280 #endif
1281 
1282   bx_phy_address paddress, ppf, poffset = PAGE_OFFSET(laddr);
1283   unsigned isWrite = rw & 1; // write or r-m-w
1284   unsigned isExecute = (rw == BX_EXECUTE);
1285   unsigned isShadowStack = (rw & 4); // 4 if shadowstack and 0 otherwise
1286   bx_address lpf = LPFOf(laddr);
1287 
1288   INC_TLB_STAT(tlbLookups);
1289   if (isExecute)
1290     INC_TLB_STAT(tlbExecuteLookups);
1291   if (isWrite)
1292     INC_TLB_STAT(tlbWriteLookups);
1293 
1294   // already looked up TLB for code access
1295   if (! isExecute && TLB_LPFOf(tlbEntry->lpf) == lpf)
1296   {
1297     paddress = tlbEntry->ppf | poffset;
1298 
1299 #if BX_SUPPORT_PKEYS
1300     if (isWrite) {
1301       if (tlbEntry->accessBits & (1 << (isShadowStack | (isWrite<<1) | user)) & BX_CPU_THIS_PTR wr_pkey[tlbEntry->pkey])
1302         return paddress;
1303     }
1304     else {
1305       if (tlbEntry->accessBits & (1 << (isShadowStack | user)) & BX_CPU_THIS_PTR rd_pkey[tlbEntry->pkey])
1306         return paddress;
1307     }
1308 #else
1309     if (tlbEntry->accessBits & (1 << (isShadowStack | (isWrite<<1) | user)))
1310       return paddress;
1311 #endif
1312 
1313     // The current access does not have permission according to the info
1314     // in our TLB cache entry.  Re-walk the page tables, in case there is
1315     // updated information in the memory image, and let the long path code
1316     // generate an exception if one is warranted.
1317 
1318     // Invalidate the TLB entry before re-walk as re-walk may end with paging fault.
1319     // The entry will be reinitialized later if page walk succeeds.
1320     tlbEntry->invalidate();
1321   }
1322 
1323   INC_TLB_STAT(tlbMisses);
1324   if (isExecute)
1325     INC_TLB_STAT(tlbExecuteMisses);
1326   if (isWrite)
1327     INC_TLB_STAT(tlbWriteMisses);
1328 
1329   Bit32u lpf_mask = 0xfff; // 4K pages
1330   Bit32u combined_access = BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER;
1331 #if BX_SUPPORT_X86_64
1332   Bit32u pkey = 0;
1333 #endif
1334 
1335   if(BX_CPU_THIS_PTR cr0.get_PG())
1336   {
1337     BX_DEBUG(("page walk for%s address 0x" FMT_LIN_ADDRX, isShadowStack ? " shadow stack" : "", laddr));
1338 
1339 #if BX_CPU_LEVEL >= 6
1340 #if BX_SUPPORT_X86_64
1341     if (long_mode())
1342       paddress = translate_linear_long_mode(laddr, lpf_mask, pkey, user, rw);
1343     else
1344 #endif
1345       if (BX_CPU_THIS_PTR cr4.get_PAE())
1346         paddress = translate_linear_PAE(laddr, lpf_mask, user, rw);
1347       else
1348 #endif
1349         paddress = translate_linear_legacy(laddr, lpf_mask, user, rw);
1350 
1351     // translate_linear functions return the combined U/S bit, R/W bit, Global Page bit
1352     // and also the effective page tables memory type in the lower 12 bits of the physical address:
1353     // Bit 1 - R/W bit
1354     // Bit 2 - U/S bit
1355     // Bits 9,10,11 - effective memory type from the page tables
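    // Illustrative example: for a 4K page (lpf_mask = 0xfff), a returned value of
    // 0x00123C06 unpacks to combined_access = 0xC06 (R/W=1, U/S=1, memtype WB=6 in
    // bits 11..9) and a page frame of 0x00123000.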
1356     combined_access = paddress & lpf_mask;
1357     paddress = (paddress & ~((Bit64u) lpf_mask)) | (laddr & lpf_mask);
1358 
1359 #if BX_CPU_LEVEL >= 5
1360     if (lpf_mask > 0xfff) {
1361       if (isExecute)
1362         BX_CPU_THIS_PTR ITLB.split_large = true;
1363       else
1364         BX_CPU_THIS_PTR DTLB.split_large = true;
1365     }
1366 #endif
1367   }
1368   else {
1369     // no paging
1370     paddress = (bx_phy_address) laddr;
1371     combined_access |= (BX_MEMTYPE_WB << 9); // treat the effective memory type from paging as WB
1372   }
1373 
1374   // Calculate physical memory address and fill in TLB cache entry
1375 #if BX_SUPPORT_VMX >= 2
1376   if (BX_CPU_THIS_PTR in_vmx_guest) {
1377     if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE)) {
1378       paddress = translate_guest_physical(paddress, laddr, 1, 0, rw, isShadowStack & !user);
1379     }
1380   }
1381 #endif
1382 #if BX_SUPPORT_SVM
1383   if (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED) {
1384     // hack: ignore isExecute attribute in SMM mode under SVM virtualization
1385     if (BX_CPU_THIS_PTR in_smm && rw == BX_EXECUTE) rw = BX_READ;
1386 
1387     paddress = nested_walk(paddress, rw, 0);
1388   }
1389 #endif
1390   paddress = A20ADDR(paddress);
1391   ppf = PPFOf(paddress);
1392 
1393   // direct memory access is NOT allowed by default
1394   tlbEntry->lpf = lpf | TLB_NoHostPtr;
1395   tlbEntry->lpf_mask = lpf_mask;
1396 #if BX_SUPPORT_PKEYS
1397   tlbEntry->pkey = pkey;
1398 #endif
1399   tlbEntry->ppf = ppf;
1400   tlbEntry->accessBits = 0;
1401 
1402   if (isExecute) {
1403     tlbEntry->accessBits |= TLB_SysExecuteOK;
1404   }
1405   else {
1406 #if BX_SUPPORT_CET
1407     if (isShadowStack) {
1408       tlbEntry->accessBits |= TLB_SysReadOK | TLB_SysReadShadowStackOK;
1409       if (isWrite)
1410         tlbEntry->accessBits |= TLB_SysWriteShadowStackOK;
1411     }
1412     else
1413 #endif
1414     {
1415       tlbEntry->accessBits |= TLB_SysReadOK;
1416       if (isWrite)
1417         tlbEntry->accessBits |= TLB_SysWriteOK;
1418     }
1419   }
1420 
1421   if (! BX_CPU_THIS_PTR cr0.get_PG()
1422 #if BX_SUPPORT_VMX >= 2
1423         && ! (BX_CPU_THIS_PTR in_vmx_guest && SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE))
1424 #endif
1425 #if BX_SUPPORT_SVM
1426         && ! (BX_CPU_THIS_PTR in_svm_guest && SVM_NESTED_PAGING_ENABLED)
1427 #endif
1428     ) {
1429     if (isExecute)
1430       tlbEntry->accessBits |= TLB_UserExecuteOK;
1431     else
1432       tlbEntry->accessBits |= TLB_UserReadOK | TLB_UserWriteOK;
1433   }
1434   else {
1435     if ((combined_access & BX_COMBINED_ACCESS_USER) != 0) {
1436 
1437       if (user) {
1438         if (isExecute) {
1439           tlbEntry->accessBits |= TLB_UserExecuteOK;
1440         }
1441         else {
1442 #if BX_SUPPORT_CET
1443           if (isShadowStack) {
1444             tlbEntry->accessBits |= TLB_UserReadOK | TLB_UserReadShadowStackOK;
1445             if (isWrite)
1446               tlbEntry->accessBits |= TLB_UserWriteShadowStackOK;
1447           }
1448           else
1449 #endif
1450           {
1451             tlbEntry->accessBits |= TLB_UserReadOK;
1452             if (isWrite)
1453               tlbEntry->accessBits |= TLB_UserWriteOK;
1454           }
1455         }
1456       }
1457 
1458 #if BX_CPU_LEVEL >= 6
1459       if (isExecute) {
1460         if (BX_CPU_THIS_PTR cr4.get_SMEP())
1461           tlbEntry->accessBits &= ~TLB_SysExecuteOK;
1462       }
1463       else {
1464         if (BX_CPU_THIS_PTR cr4.get_SMAP())
1465           tlbEntry->accessBits &= ~(TLB_SysReadOK | TLB_SysWriteOK);
1466       }
1467 #endif
1468 
1469 #if BX_SUPPORT_CET
1470       // system shadow stack accesses cannot access user pages
1471       tlbEntry->accessBits &= ~(TLB_SysReadShadowStackOK | TLB_SysWriteShadowStackOK);
1472 #endif
1473     }
1474   }
1475 
1476 #if BX_CPU_LEVEL >= 6
1477   if (combined_access & BX_COMBINED_GLOBAL_PAGE) // Global bit
1478     tlbEntry->accessBits |= TLB_GlobalPage;
1479 #endif
1480 
1481   // Attempt to get a host pointer to this physical page. Put that
1482   // pointer in the TLB cache. Note that if the request is vetoed, NULL
1483   // will be returned, and it's OK to OR zero in anyway.
1484   tlbEntry->hostPageAddr = BX_CPU_THIS_PTR getHostMemAddr(ppf, rw);
1485   if (tlbEntry->hostPageAddr) {
1486     // All access allowed also via direct pointer
1487 #if BX_X86_DEBUGGER
1488     if (! hwbreakpoint_check(laddr, BX_HWDebugMemW, BX_HWDebugMemRW))
1489 #endif
1490        tlbEntry->lpf = lpf; // allow direct access with HostPtr
1491   }
1492 
1493 #if BX_SUPPORT_MEMTYPE
1494   tlbEntry->memtype = resolve_memtype(memtype_by_mtrr(tlbEntry->ppf), combined_access >> 9 /* effective page tables memory type */);
1495 #endif
1496 
1497   return paddress;
1498 }
1499 
1500 const char *get_memtype_name(BxMemtype memtype)
1501 {
1502   static const char *mem_type_string[9] = { "UC", "WC", "RESERVED2", "RESERVED3", "WT", "WP", "WB", "UC-", "INVALID" };
1503   if (memtype > BX_MEMTYPE_INVALID) memtype = BX_MEMTYPE_INVALID;
1504   return mem_type_string[memtype];
1505 }
1506 
1507 #if BX_SUPPORT_MEMTYPE
1508 BxMemtype BX_CPP_AttrRegparmN(1) BX_CPU_C::memtype_by_mtrr(bx_phy_address pAddr)
1509 {
1510 #if BX_CPU_LEVEL >= 6
1511   if (is_cpu_extension_supported(BX_ISA_MTRR)) {
1512     const Bit32u BX_MTRR_DEFTYPE_FIXED_MTRR_ENABLE_MASK = (1 << 10);
1513     const Bit32u BX_MTRR_ENABLE_MASK = (1 << 11);
1514 
1515     if (BX_CPU_THIS_PTR msr.mtrr_deftype & BX_MTRR_ENABLE_MASK) {
1516       // fixed-range MTRRs take priority over variable-range MTRRs when enabled
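      // Illustrative example: pAddr = 0xB8000 falls in the 16K fixed-range area
      // (0xA0000..0xBFFFF): index = ((0xB8000 - 0x80000) >> 14) & 0xf = 14, selecting
      // mtrrfix16k[1].ubyte(6), i.e. byte 6 of the IA32_MTRR_FIX16K_A0000 range register.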
1517       if (pAddr < 0x100000 && (BX_CPU_THIS_PTR msr.mtrr_deftype & BX_MTRR_DEFTYPE_FIXED_MTRR_ENABLE_MASK)) {
1518         if (pAddr < 0x80000) {
1519           unsigned index = (pAddr >> 16) & 0x7;
1520           return (BxMemtype) BX_CPU_THIS_PTR msr.mtrrfix64k.ubyte(index);
1521         }
1522         if (pAddr < 0xc0000) {
1523           unsigned index = ((pAddr - 0x80000) >> 14) & 0xf;
1524           return (BxMemtype) BX_CPU_THIS_PTR msr.mtrrfix16k[index >> 3].ubyte(index & 0x7);
1525         }
1526         else {
1527           unsigned index =  (pAddr - 0xc0000) >> 12;
1528           return (BxMemtype) BX_CPU_THIS_PTR msr.mtrrfix4k [index >> 3].ubyte(index & 0x7);
1529         }
1530       }
1531 
1532       int memtype = -1;
1533 
1534       for (unsigned i=0; i < BX_NUM_VARIABLE_RANGE_MTRRS; i++) {
1535         Bit64u base = BX_CPU_THIS_PTR msr.mtrrphys[i*2];
1536         Bit64u mask = BX_CPU_THIS_PTR msr.mtrrphys[i*2 + 1];
1537         if ((mask & BX_MTRR_ENABLE_MASK) == 0) continue;
1538         mask = PPFOf(mask);
1539         if ((pAddr & mask) == (base & mask)) {
1540           //
1541           // Matched variable MTRR, check overlap rules:
1542           // - if two or more variable memory ranges match and the memory types are identical,
1543           //   then that memory type is used.
1544           // - if two or more variable memory ranges match and one of the memory types is UC,
1545         //   the UC memory type is used.
1546           // - if two or more variable memory ranges match and the memory types are WT and WB,
1547           //   the WT memory type is used.
1548           // - For overlaps not defined by the above rules, processor behavior is undefined.
1549           //
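          // Illustrative example: if one matching range is WB and another is WT, the
          // effective type is WT; if any matching range is UC, UC wins immediately below.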
1550           BxMemtype curr_memtype = BxMemtype(base & 0xff);
1551           if (curr_memtype == BX_MEMTYPE_UC)
1552             return BX_MEMTYPE_UC;
1553 
1554           if (memtype == -1) {
1555             memtype = curr_memtype; // first match
1556           }
1557           else if (memtype != (int) curr_memtype) {
1558             if (curr_memtype == BX_MEMTYPE_WT && memtype == BX_MEMTYPE_WB)
1559               memtype = BX_MEMTYPE_WT;
1560             else if (curr_memtype == BX_MEMTYPE_WB && memtype == BX_MEMTYPE_WT)
1561               memtype = BX_MEMTYPE_WT;
1562             else
1563               memtype = BX_MEMTYPE_INVALID;
1564           }
1565         }
1566       }
1567 
1568       if (memtype != -1)
1569         return BxMemtype(memtype);
1570 
1571       // didn't match any variable range MTRR, return default memory type
1572       return BxMemtype(BX_CPU_THIS_PTR msr.mtrr_deftype & 0xff);
1573     }
1574 
1575     // return UC memory type when MTRRs are not enabled
1576     return BX_MEMTYPE_UC;
1577   }
1578 #endif
1579 
1580   // return INVALID memory type when MTRRs are not supported
1581   return BX_MEMTYPE_INVALID;
1582 }
1583 
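// memtype_by_pat(): 'pat' is a 3-bit index (computed by calculate_pat() from the page's
// PAT/PCD/PWT attribute bits); each byte of the IA32_PAT MSR holds the memory type for
// one index value, so the lookup below is a simple byte select.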
1584 BxMemtype BX_CPP_AttrRegparmN(1) BX_CPU_C::memtype_by_pat(unsigned pat)
1585 {
1586   return (BxMemtype) BX_CPU_THIS_PTR msr.pat.ubyte(pat);
1587 }
1588 
1589 BxMemtype BX_CPP_AttrRegparmN(2) BX_CPU_C::resolve_memtype(BxMemtype mtrr_memtype, BxMemtype pat_memtype)
1590 {
1591   if (BX_CPU_THIS_PTR cr0.get_CD())
1592     return BX_MEMTYPE_UC;
1593 
1594   if (mtrr_memtype == BX_MEMTYPE_INVALID) // effectively ignore the MTRR memory type
1595     mtrr_memtype = BX_MEMTYPE_WB;
1596 
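  // The switch below combines the two types, generally picking the more restrictive one
  // (e.g. MTRR=WB with PAT=WT yields WT, and PAT=UC always yields UC); PAT=WB simply
  // defers to the MTRR type.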
1597   switch(pat_memtype) {
1598     case BX_MEMTYPE_UC:
1599     case BX_MEMTYPE_WC:
1600       return pat_memtype;
1601 
1602     case BX_MEMTYPE_WT:
1603     case BX_MEMTYPE_WP:
1604       if (mtrr_memtype == BX_MEMTYPE_WC) return BX_MEMTYPE_UC;
1605       return (mtrr_memtype < pat_memtype) ? mtrr_memtype : pat_memtype;
1606 
1607     case BX_MEMTYPE_WB:
1608       return mtrr_memtype;
1609 
1610     case BX_MEMTYPE_UC_WEAK:
1611       return (mtrr_memtype == BX_MEMTYPE_WC) ? BX_MEMTYPE_WC : BX_MEMTYPE_UC;
1612 
1613     default:
1614       BX_PANIC(("unexpected PAT memory type: %u", (unsigned) pat_memtype));
1615   }
1616 
1617   return BX_MEMTYPE_INVALID; // keep compiler happy
1618 }
1619 #endif
1620 
1621 #if BX_SUPPORT_SVM
1622 
1623 void BX_CPU_C::nested_page_fault(unsigned fault, bx_phy_address guest_paddr, unsigned rw, unsigned is_page_walk)
1624 {
1625   unsigned isWrite = rw & 1;
1626 
1627   Bit64u error_code = fault | (1 << 2) | (isWrite << 1);
1628   if (rw == BX_EXECUTE)
1629     error_code |= ERROR_CODE_ACCESS; // I/D = 1
1630 
1631   if (is_page_walk)
1632     error_code |= BX_CONST64(1) << 33;
1633   else
1634     error_code |= BX_CONST64(1) << 32;
1635 
1636   Svm_Vmexit(SVM_VMEXIT_NPF, error_code, guest_paddr);
1637 }
1638 
1639 bx_phy_address BX_CPU_C::nested_walk_long_mode(bx_phy_address guest_paddr, unsigned rw, bool is_page_walk)
1640 {
1641   bx_phy_address entry_addr[4];
1642   Bit64u entry[4];
1643   BxMemtype entry_memtype[4] = { BX_MEMTYPE_INVALID };
1644   bool nx_fault = false;
1645   int leaf;
1646 
1647   SVM_CONTROLS *ctrls = &BX_CPU_THIS_PTR vmcb.ctrls;
1648   SVM_HOST_STATE *host_state = &BX_CPU_THIS_PTR vmcb.host_state;
1649   bx_phy_address ppf = ctrls->ncr3 & BX_CR3_PAGING_MASK;
1650   Bit64u offset_mask = BX_CONST64(0x0000ffffffffffff);
1651   unsigned combined_access = BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER;
1652 
1653   Bit64u reserved = PAGING_PAE_RESERVED_BITS;
1654   if (! host_state->efer.get_NXE())
1655     reserved |= PAGE_DIRECTORY_NX_BIT;
1656 
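  // Each iteration consumes 9 bits of the guest physical address, top level first.
  // Illustrative example: for leaf = BX_LEVEL_PML4 (3) the entry offset is
  // ((guest_paddr >> 39) & 0x1ff) * 8, which is what ((guest_paddr >> (9 + 9*leaf)) & 0xff8)
  // computes below.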
1657   for (leaf = BX_LEVEL_PML4;; --leaf) {
1658     entry_addr[leaf] = ppf + ((guest_paddr >> (9 + 9*leaf)) & 0xff8);
1659     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
1660     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, BX_MEMTYPE_INVALID, BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1661     offset_mask >>= 9;
1662 
1663     Bit64u curr_entry = entry[leaf];
1664     int fault = check_entry_PAE(bx_paging_level[leaf], curr_entry, reserved, rw, &nx_fault);
1665     if (fault >= 0)
1666       nested_page_fault(fault, guest_paddr, rw, is_page_walk);
1667 
1668     combined_access &= curr_entry; // U/S and R/W
1669     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
1670 
1671     if (leaf == BX_LEVEL_PTE) break;
1672 
1673     if (curr_entry & 0x80) {
1674       if (leaf > (BX_LEVEL_PDE + !!is_cpu_extension_supported(BX_ISA_1G_PAGES))) {
1675         BX_DEBUG(("Nested PAE Walk %s: PS bit set !", bx_paging_level[leaf]));
1676         nested_page_fault(ERROR_RESERVED | ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1677       }
1678 
1679       ppf &= BX_CONST64(0x000fffffffffe000);
1680       if (ppf & offset_mask) {
1681         BX_DEBUG(("Nested PAE Walk %s: reserved bit is set: 0x" FMT_ADDRX64, bx_paging_level[leaf], curr_entry));
1682         nested_page_fault(ERROR_RESERVED | ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1683       }
1684 
1685       break;
1686     }
1687   }
1688 
1689   bool isWrite = (rw & 1); // write or r-m-w
1690 
1691   unsigned priv_index = (1<<3) /* user */ | (combined_access | isWrite);
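  // priv_index bit layout, as implied by the bit positions used here: bit 3 = user access
  // (nested walks are always treated as user), bit 2 = combined U/S, bit 1 = combined R/W,
  // bit 0 = write access; priv_check[] then decides whether the access is allowed.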
1692 
1693   if (!priv_check[priv_index] || nx_fault)
1694     nested_page_fault(ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1695 
1696   // Update A/D bits if needed
1697   update_access_dirty_PAE(entry_addr, entry, entry_memtype, BX_LEVEL_PML4, leaf, isWrite);
1698 
1699   // Make up the physical page frame address
1700   return ppf | (bx_phy_address)(guest_paddr & offset_mask);
1701 }
1702 
1703 bx_phy_address BX_CPU_C::nested_walk_PAE(bx_phy_address guest_paddr, unsigned rw, bool is_page_walk)
1704 {
1705   bx_phy_address entry_addr[2];
1706   Bit64u entry[2];
1707   BxMemtype entry_memtype[2] = { BX_MEMTYPE_INVALID };
1708   bool nx_fault = false;
1709   int leaf;
1710 
1711   unsigned combined_access = BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER;
1712 
1713   SVM_CONTROLS *ctrls = &BX_CPU_THIS_PTR vmcb.ctrls;
1714   SVM_HOST_STATE *host_state = &BX_CPU_THIS_PTR vmcb.host_state;
1715   bx_phy_address ncr3 = ctrls->ncr3 & 0xffffffe0;
1716   unsigned index = (guest_paddr >> 30) & 0x3;
1717   Bit64u pdptr;
1718 
1719   bx_phy_address pdpe_entry_addr = (bx_phy_address) (ncr3 | (index << 3));
1720   access_read_physical(pdpe_entry_addr, 8, &pdptr);
1721   BX_NOTIFY_PHY_MEMORY_ACCESS(pdpe_entry_addr, 8, BX_MEMTYPE_INVALID, BX_READ, (BX_PDPTR0_ACCESS + index), (Bit8u*) &pdptr);
1722 
1723   if (! (pdptr & 0x1)) {
1724     BX_DEBUG(("Nested PAE Walk PDPTE%d entry not present !", index));
1725     nested_page_fault(ERROR_NOT_PRESENT, guest_paddr, rw, is_page_walk);
1726   }
1727 
1728   if (pdptr & PAGING_PAE_PDPTE_RESERVED_BITS) {
1729     BX_DEBUG(("Nested PAE Walk PDPTE%d entry reserved bits set: 0x" FMT_ADDRX64, index, pdptr));
1730     nested_page_fault(ERROR_RESERVED | ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1731   }
1732 
1733   Bit64u reserved = PAGING_LEGACY_PAE_RESERVED_BITS;
1734   if (! host_state->efer.get_NXE())
1735     reserved |= PAGE_DIRECTORY_NX_BIT;
1736 
1737   bx_phy_address ppf = pdptr & BX_CONST64(0x000ffffffffff000);
1738 
1739   for (leaf = BX_LEVEL_PDE;; --leaf) {
1740     entry_addr[leaf] = ppf + ((guest_paddr >> (9 + 9*leaf)) & 0xff8);
1741     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
1742     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, BX_MEMTYPE_INVALID, BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1743 
1744     Bit64u curr_entry = entry[leaf];
1745     int fault = check_entry_PAE(bx_paging_level[leaf], curr_entry, reserved, rw, &nx_fault);
1746     if (fault >= 0)
1747       nested_page_fault(fault, guest_paddr, rw, is_page_walk);
1748 
1749     combined_access &= curr_entry; // U/S and R/W
1750     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
1751 
1752     if (leaf == BX_LEVEL_PTE) break;
1753 
1754     // Ignore CR4.PSE in PAE mode
1755     if (curr_entry & 0x80) {
1756       if (curr_entry & PAGING_PAE_PDE2M_RESERVED_BITS) {
1757         BX_DEBUG(("PAE PDE2M: reserved bit is set PDE=0x" FMT_ADDRX64, curr_entry));
1758         nested_page_fault(ERROR_RESERVED | ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1759       }
1760 
1761       // Make up the physical page frame address
1762       ppf = (bx_phy_address)((curr_entry & BX_CONST64(0x000fffffffe00000)) | (guest_paddr & 0x001ff000));
1763       break;
1764     }
1765   }
1766 
1767   bool isWrite = (rw & 1); // write or r-m-w
1768 
1769   unsigned priv_index = (1<<3) /* user */ | (combined_access | isWrite);
1770 
1771   if (!priv_check[priv_index] || nx_fault)
1772     nested_page_fault(ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1773 
1774   // Update A/D bits if needed
1775   update_access_dirty_PAE(entry_addr, entry, entry_memtype, BX_LEVEL_PDE, leaf, isWrite);
1776 
1777   Bit32u page_offset = PAGE_OFFSET(guest_paddr);
1778   return ppf | page_offset;
1779 }
1780 
1781 bx_phy_address BX_CPU_C::nested_walk_legacy(bx_phy_address guest_paddr, unsigned rw, bool is_page_walk)
1782 {
1783   bx_phy_address entry_addr[2];
1784   Bit32u entry[2];
1785   BxMemtype entry_memtype[2] = { BX_MEMTYPE_INVALID };
1786   int leaf;
1787 
1788   SVM_CONTROLS *ctrls = &BX_CPU_THIS_PTR vmcb.ctrls;
1789   SVM_HOST_STATE *host_state = &BX_CPU_THIS_PTR vmcb.host_state;
1790   bx_phy_address ppf = ctrls->ncr3 & BX_CR3_PAGING_MASK;
1791   unsigned combined_access = BX_COMBINED_ACCESS_WRITE | BX_COMBINED_ACCESS_USER;
1792 
1793   for (leaf = BX_LEVEL_PDE;; --leaf) {
1794     entry_addr[leaf] = ppf + ((guest_paddr >> (10 + 10*leaf)) & 0xffc);
1795     access_read_physical(entry_addr[leaf], 4, &entry[leaf]);
1796     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 4, BX_MEMTYPE_INVALID, BX_READ, (BX_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1797 
1798     Bit32u curr_entry = entry[leaf];
1799     if (!(curr_entry & 0x1)) {
1800       BX_DEBUG(("Nested %s Walk: entry not present", bx_paging_level[leaf]));
1801       nested_page_fault(ERROR_NOT_PRESENT, guest_paddr, rw, is_page_walk);
1802     }
1803 
1804     combined_access &= curr_entry; // U/S and R/W
1805     ppf = curr_entry & 0xfffff000;
1806 
1807     if (leaf == BX_LEVEL_PTE) break;
1808 
1809     if ((curr_entry & 0x80) != 0 && host_state->cr4.get_PSE()) {
1810       // 4M paging, only if CR4.PSE enabled, ignore PDE.PS otherwise
1811       if (curr_entry & PAGING_PDE4M_RESERVED_BITS) {
1812         BX_DEBUG(("Nested PSE Walk PDE4M: reserved bit is set: PDE=0x%08x", entry[BX_LEVEL_PDE]));
1813         nested_page_fault(ERROR_RESERVED | ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1814       }
1815 
1816       // make up the physical frame number
1817       ppf = (curr_entry & 0xffc00000) | (guest_paddr & 0x003ff000);
1818 #if BX_PHY_ADDRESS_WIDTH > 32
1819       ppf |= ((bx_phy_address)(curr_entry & 0x003fe000)) << 19;
1820 #endif
1821       break;
1822     }
1823   }
1824 
1825   bool isWrite = (rw & 1); // write or r-m-w
1826 
1827   unsigned priv_index = (1<<3) /* user */ | (combined_access | isWrite);
1828 
1829   if (!priv_check[priv_index])
1830     nested_page_fault(ERROR_PROTECTION, guest_paddr, rw, is_page_walk);
1831 
1832   update_access_dirty(entry_addr, entry, entry_memtype, leaf, isWrite);
1833 
1834   Bit32u page_offset = PAGE_OFFSET(guest_paddr);
1835   return ppf | page_offset;
1836 }
1837 
1838 bx_phy_address BX_CPU_C::nested_walk(bx_phy_address guest_paddr, unsigned rw, bool is_page_walk)
1839 {
1840   SVM_HOST_STATE *host_state = &BX_CPU_THIS_PTR vmcb.host_state;
1841 
1842   BX_DEBUG(("Nested walk for guest paddr 0x" FMT_PHY_ADDRX, guest_paddr));
1843 
1844   if (host_state->efer.get_LMA())
1845     return nested_walk_long_mode(guest_paddr, rw, is_page_walk);
1846   else if (host_state->cr4.get_PAE())
1847     return nested_walk_PAE(guest_paddr, rw, is_page_walk);
1848   else
1849     return nested_walk_legacy(guest_paddr, rw, is_page_walk);
1850 }
1851 
1852 #endif
1853 
1854 #if BX_SUPPORT_VMX >= 2
1855 
1856 /* EPT access type */
1857 enum {
1858   BX_EPT_READ    = 0x01,
1859   BX_EPT_WRITE   = 0x02,
1860   BX_EPT_EXECUTE = 0x04
1861 };
1862 
1863 /* EPT access mask */
1864 enum {
1865   BX_EPT_ENTRY_NOT_PRESENT        = 0x00,
1866   BX_EPT_ENTRY_READ_ONLY          = 0x01,
1867   BX_EPT_ENTRY_WRITE_ONLY         = 0x02,
1868   BX_EPT_ENTRY_READ_WRITE         = 0x03,
1869   BX_EPT_ENTRY_EXECUTE_ONLY       = 0x04,
1870   BX_EPT_ENTRY_READ_EXECUTE       = 0x05,
1871   BX_EPT_ENTRY_WRITE_EXECUTE      = 0x06,
1872   BX_EPT_ENTRY_READ_WRITE_EXECUTE = 0x07
1873 };
1874 
1875 #define BX_VMX_EPT_ACCESS_DIRTY_ENABLED                 (BX_CPU_THIS_PTR vmcs.eptptr & 0x40)
1876 #define BX_VMX_EPT_SUPERVISOR_SHADOW_STACK_CTRL_ENABLED (BX_CPU_THIS_PTR vmcs.eptptr & 0x80)
1877 
1878 //                   Format of an EPT Entry
1879 // -----------------------------------------------------------
1880 // 00    | Read access
1881 // 01    | Write access
1882 // 02    | Execute Access
1883 // 05-03 | EPT Memory type (for leaf entries, reserved otherwise)
1884 // 06    | Ignore PAT memory type (for leaf entries, reserved otherwise)
1885 // 07    | Page Size, must be 1 to indicate a Large Page
1886 // 08    | Accessed bit (if supported, ignored otherwise)
1887 // 09    | Dirty bit (for leaf entries, if supported, ignored otherwise)
1888 // 11-10 | (ignored)
1889 // PA-12 | Physical address
1890 // 51-PA | Reserved (must be zero)
1891 // 59-52 | (ignored)
1892 // 60    | Supervisor Shadow Stack Page (CET)
1893 // 61    | Sub-Page Protected (SPP)
1894 // 63    | Suppress #VE
1895 // -----------------------------------------------------------
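//
// Illustrative example: a leaf PTE value of 0x0000000012345037 grants R/W/X access
// (bits 2..0 = 111), has EPT memory type WB (bits 5..3 = 110), and maps physical page
// 0x12345000.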
1896 
1897 const Bit64u BX_SUPPRESS_EPT_VIOLATION_EXCEPTION = (BX_CONST64(1) << 63);
1898 const Bit64u BX_SUB_PAGE_PROTECTED               = (BX_CONST64(1) << 61);
1899 const Bit64u BX_SUPERVISOR_SHADOW_STACK_PAGE     = (BX_CONST64(1) << 60);
1900 
1901 const Bit64u PAGING_EPT_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS;
1902 
1903 bx_phy_address BX_CPU_C::translate_guest_physical(bx_phy_address guest_paddr, bx_address guest_laddr, bool guest_laddr_valid, bool is_page_walk, unsigned rw, bool supervisor_shadow_stack)
1904 {
1905   VMCS_CACHE *vm = &BX_CPU_THIS_PTR vmcs;
1906   bx_phy_address entry_addr[4], ppf = LPFOf(vm->eptptr);
1907   Bit64u entry[4];
1908   int leaf;
1909 
1910 #if BX_SUPPORT_MEMTYPE
1911   // The MTRRs have no effect on the memory type used for an access to EPT paging structures.
1912   BxMemtype eptptr_memtype = BX_CPU_THIS_PTR cr0.get_CD() ? (BX_MEMTYPE_UC) : BxMemtype(vm->eptptr & 0x7);
1913 #endif
1914 
1915   Bit32u combined_access = 0x7, access_mask = 0;
1916   Bit64u offset_mask = BX_CONST64(0x0000ffffffffffff);
1917 
1918   BX_DEBUG(("EPT walk for guest paddr 0x" FMT_PHY_ADDRX, guest_paddr));
1919 
1920   // when EPT A/D is enabled, treat guest page-table accesses as writes
1921   if (BX_VMX_EPT_ACCESS_DIRTY_ENABLED && is_page_walk && guest_laddr_valid)
1922     rw = BX_WRITE;
1923 
1924   if (rw == BX_EXECUTE) access_mask |= BX_EPT_EXECUTE;
1925   if (rw & 1) access_mask |= BX_EPT_WRITE; // write or r-m-w
1926   if ((rw & 3) == BX_READ) access_mask |= BX_EPT_READ;  // handle shadow stack reads correctly
1927 
1928   Bit32u vmexit_reason = 0;
1929 
1930   for (leaf = BX_LEVEL_PML4;; --leaf) {
1931     entry_addr[leaf] = ppf + ((guest_paddr >> (9 + 9*leaf)) & 0xff8);
1932     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
1933     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, MEMTYPE(eptptr_memtype), BX_READ, (BX_EPT_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
1934 
1935     offset_mask >>= 9;
1936     Bit64u curr_entry = entry[leaf];
1937     Bit32u curr_access_mask = curr_entry & 0x7;
1938 
1939     if (curr_access_mask == BX_EPT_ENTRY_NOT_PRESENT) {
1940       BX_DEBUG(("EPT %s: not present", bx_paging_level[leaf]));
1941       vmexit_reason = VMX_VMEXIT_EPT_VIOLATION;
1942       break;
1943     }
1944 
1945     if (curr_access_mask == BX_EPT_ENTRY_WRITE_ONLY || curr_access_mask == BX_EPT_ENTRY_WRITE_EXECUTE) {
1946       BX_DEBUG(("EPT %s: EPT misconfiguration mask=%d", bx_paging_level[leaf], curr_access_mask));
1947       vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1948       break;
1949     }
1950 
1951     extern bool isMemTypeValidMTRR(unsigned memtype);
1952     if (! isMemTypeValidMTRR((curr_entry >> 3) & 7)) {
1953       BX_DEBUG(("EPT %s: EPT misconfiguration memtype=%d",
1954         bx_paging_level[leaf], (unsigned)((curr_entry >> 3) & 7)));
1955       vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1956       break;
1957     }
1958 
1959     if (curr_entry & PAGING_EPT_RESERVED_BITS) {
1960       BX_DEBUG(("EPT %s: reserved bit is set 0x" FMT_ADDRX64, bx_paging_level[leaf], curr_entry));
1961       vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1962       break;
1963     }
1964 
1965     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
1966 
1967     if (leaf == BX_LEVEL_PTE) break;
1968 
1969     if (curr_entry & 0x80) {
1970       if (leaf > (BX_LEVEL_PDE + !!is_cpu_extension_supported(BX_ISA_1G_PAGES))) {
1971         BX_DEBUG(("EPT %s: PS bit set !", bx_paging_level[leaf]));
1972         vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1973         break;
1974       }
1975 
1976       ppf &= BX_CONST64(0x000fffffffffe000);
1977       if (ppf & offset_mask) {
1978          BX_DEBUG(("EPT %s: reserved bit is set: 0x" FMT_ADDRX64, bx_paging_level[leaf], curr_entry));
1979          vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1980          break;
1981       }
1982 
1983       // Make up the physical page frame address
1984       ppf += (bx_phy_address)(guest_paddr & offset_mask);
1985       break;
1986     }
1987 
1988     // EPT non-leaf entry, check for reserved bits
1989     if ((curr_entry >> 3) & 0xf) {
1990       BX_DEBUG(("EPT %s: EPT misconfiguration, reserved bits set for non-leaf entry", bx_paging_level[leaf]));
1991       vmexit_reason = VMX_VMEXIT_EPT_MISCONFIGURATION;
1992       break;
1993     }
1994 
1995     combined_access &= curr_access_mask;
1996   }
1997 
1998   // defer final combined_access calculation (with leaf entry) until CET is handled
1999 
2000   if (!vmexit_reason) {
2001 #if BX_SUPPORT_CET
2002     if (BX_VMX_EPT_SUPERVISOR_SHADOW_STACK_CTRL_ENABLED && supervisor_shadow_stack) {
2003       // The EPT.R bit is set in every EPT paging-structure entry controlling the translation
2004       // The EPT.W bit is set in every EPT paging-structure entry controlling the translation, except possibly the leaf entry (shadow stack write access is still allowed)
2005       // The SSS bit (bit 60) is 1 in the EPT paging-structure entry that maps the page
2006       bool supervisor_shadow_stack_page = ((combined_access & BX_EPT_ENTRY_READ_WRITE) == BX_EPT_ENTRY_READ_WRITE) &&
2007                                              ((entry[leaf] & BX_EPT_READ) != 0) &&
2008                                              (((entry[leaf] & BX_EPT_WRITE) == 0) || !(access_mask & BX_EPT_WRITE)) &&
2009                                              ((entry[leaf] & BX_SUPERVISOR_SHADOW_STACK_PAGE) != 0);
2010       if (!supervisor_shadow_stack_page) {
2011         BX_ERROR(("VMEXIT: supervisor shadow stack access to non supervisor shadow stack page"));
2012         vmexit_reason = VMX_VMEXIT_EPT_VIOLATION;
2013       }
2014     }
2015     else
2016 #endif
2017     {
2018       combined_access &= entry[leaf];
2019 
2020       if ((access_mask & combined_access) != access_mask) {
2021         vmexit_reason = VMX_VMEXIT_EPT_VIOLATION;
2022         if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_SUBPAGE_WR_PROTECT_CTRL) && (entry[leaf] & BX_SUB_PAGE_PROTECTED) != 0 && leaf == BX_LEVEL_PTE) {
2023           if ((access_mask & BX_EPT_WRITE) != 0 && (combined_access & BX_EPT_WRITE) == 0 && guest_laddr_valid && ! is_page_walk)
2024             if (spp_walk(guest_paddr, guest_laddr, MEMTYPE(eptptr_memtype)))
2025               vmexit_reason = 0;
2026         }
2027       }
2028     }
2029   }
2030 
2031   if (vmexit_reason) {
2032     BX_ERROR(("VMEXIT: EPT %s for guest paddr 0x" FMT_PHY_ADDRX " laddr 0x" FMT_ADDRX,
2033        (vmexit_reason == VMX_VMEXIT_EPT_VIOLATION) ? "violation" : "misconfig", guest_paddr, guest_laddr));
2034 
2035     Bit32u vmexit_qualification = 0;
2036 
2037     // no VMExit qualification for EPT Misconfiguration VMExit
2038     if (vmexit_reason == VMX_VMEXIT_EPT_VIOLATION) {
2039       combined_access &= entry[leaf];
2040       vmexit_qualification = access_mask | (combined_access << 3);
2041       if (guest_laddr_valid) {
2042         vmexit_qualification |= (1<<7);
2043         if (! is_page_walk) vmexit_qualification |= (1<<8);
2044       }
2045       if (BX_CPU_THIS_PTR nmi_unblocking_iret)
2046         vmexit_qualification |= (1 << 12);
2047 #if BX_SUPPORT_CET
2048       if (rw & 4) // shadow stack access
2049         vmexit_qualification |= (1 << 13);
2050 
2051       if (BX_VMX_EPT_SUPERVISOR_SHADOW_STACK_CTRL_ENABLED && (entry[leaf] & BX_SUPERVISOR_SHADOW_STACK_PAGE) != 0)
2052         vmexit_qualification |= (1 << 14);
2053 #endif
2054       if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_VIOLATION_EXCEPTION)) {
2055         if ((entry[leaf] & BX_SUPPRESS_EPT_VIOLATION_EXCEPTION) == 0)
2056           Virtualization_Exception(vmexit_qualification, guest_paddr, guest_laddr);
2057       }
2058     }
2059 
2060     VMwrite64(VMCS_64BIT_GUEST_PHYSICAL_ADDR, guest_paddr);
2061     VMwrite_natural(VMCS_GUEST_LINEAR_ADDR, guest_laddr);
2062     VMexit(vmexit_reason, vmexit_qualification);
2063   }
2064 
2065   if (BX_VMX_EPT_ACCESS_DIRTY_ENABLED) {
2066     // write access and Dirty-bit is not set in the leaf entry
2067     unsigned dirty_update = (rw & 1) && !(entry[leaf] & 0x200);
2068     if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_PML_ENABLE))
2069       vmx_page_modification_logging(guest_paddr, dirty_update);
2070 
2071     update_ept_access_dirty(entry_addr, entry, MEMTYPE(eptptr_memtype), leaf, rw & 1);
2072   }
2073 
2074   Bit32u page_offset = PAGE_OFFSET(guest_paddr);
2075   return ppf | page_offset;
2076 }
2077 
2078 // Access bit 8, Dirty bit 9
2079 void BX_CPU_C::update_ept_access_dirty(bx_phy_address *entry_addr, Bit64u *entry, BxMemtype eptptr_memtype, unsigned leaf, unsigned write)
2080 {
2081   // Update A bit if needed
2082   for (unsigned level=BX_LEVEL_PML4; level > leaf; level--) {
2083     if (!(entry[level] & 0x100)) {
2084       entry[level] |= 0x100;
2085       access_write_physical(entry_addr[level], 8, &entry[level]);
2086       BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[level], 8, MEMTYPE(eptptr_memtype), BX_WRITE, (BX_EPT_PTE_ACCESS + level), (Bit8u*)(&entry[level]));
2087     }
2088   }
2089 
2090   // Update A/D bits if needed
2091   if (!(entry[leaf] & 0x100) || (write && !(entry[leaf] & 0x200))) {
2092     entry[leaf] |= (0x100 | (write<<9)); // Update A and possibly D bits
2093     access_write_physical(entry_addr[leaf], 8, &entry[leaf]);
2094     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, MEMTYPE(eptptr_memtype), BX_WRITE, (BX_EPT_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
2095   }
2096 }
2097 
2098 const Bit64u PAGING_SPP_RESERVED_BITS = BX_PAGING_PHY_ADDRESS_RESERVED_BITS | BX_CONST64(0xFFF0000000000FFE);
2099 
2100 const Bit32u VMX_SPP_NOT_PRESENT_QUALIFICATION = (1<<11);
2101 
2102 bool BX_CPU_C::spp_walk(bx_phy_address guest_paddr, bx_address guest_laddr, BxMemtype memtype)
2103 {
2104   VMCS_CACHE *vm = &BX_CPU_THIS_PTR vmcs;
2105   bx_phy_address entry_addr[4], ppf = LPFOf(vm->spptp);
2106   Bit64u entry[4];
2107   int leaf;
2108 
2109   BX_DEBUG(("SPP walk for guest paddr 0x" FMT_PHY_ADDRX, guest_paddr));
2110 
2111   Bit32u vmexit_reason = 0;
2112   Bit32u vmexit_qualification = 0;
2113 
2114   for (leaf = BX_LEVEL_PML4;; --leaf) {
2115     entry_addr[leaf] = ppf + ((guest_paddr >> (9 + 9*leaf)) & 0xff8);
2116     access_read_physical(entry_addr[leaf], 8, &entry[leaf]);
2117     BX_NOTIFY_PHY_MEMORY_ACCESS(entry_addr[leaf], 8, MEMTYPE(memtype), BX_READ, (BX_EPT_SPP_PTE_ACCESS + leaf), (Bit8u*)(&entry[leaf]));
2118 
2119     if (leaf == BX_LEVEL_PTE) break;
2120 
2121     Bit64u curr_entry = entry[leaf];
2122 
2123     if (!(curr_entry & 1)) {
2124       BX_DEBUG(("SPP %s: not present", bx_paging_level[leaf]));
2125       vmexit_reason = VMX_VMEXIT_SPP;
2126       vmexit_qualification = VMX_SPP_NOT_PRESENT_QUALIFICATION;
2127       break;
2128     }
2129 
2130     if (curr_entry & PAGING_SPP_RESERVED_BITS) {
2131       BX_DEBUG(("SPP %s: reserved bit is set 0x" FMT_ADDRX64, bx_paging_level[leaf], curr_entry));
2132       vmexit_reason = VMX_VMEXIT_SPP;
2133       break;
2134     }
2135 
2136     ppf = curr_entry & BX_CONST64(0x000ffffffffff000);
2137   }
2138 
2139   if (vmexit_reason) {
2140     BX_ERROR(("VMEXIT: SPP %s for guest paddr 0x" FMT_PHY_ADDRX " laddr 0x" FMT_ADDRX,
2141        (vmexit_qualification == VMX_SPP_NOT_PRESENT_QUALIFICATION) ? "violation" : "misconfig", guest_paddr, guest_laddr));
2142 
2143     if (BX_CPU_THIS_PTR nmi_unblocking_iret)
2144       vmexit_qualification |= (1 << 12);
2145 
2146     VMwrite64(VMCS_64BIT_GUEST_PHYSICAL_ADDR, guest_paddr);
2147     VMwrite_natural(VMCS_GUEST_LINEAR_ADDR, guest_laddr);
2148     VMexit(vmexit_reason, vmexit_qualification);
2149   }
2150 
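  // Each 128-byte sub-page region of the 4K page owns 2 bits in the SPP leaf entry, and
  // bit 2*i is read here as the write-permission bit for region i. Illustrative example:
  // a guest physical offset of 0x280 is region 5, so the permission is bit 10 of the entry.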
2151   Bit32u spp_bit = 2 * ((guest_paddr & 0xFFF) >> 7);
2152   return (entry[BX_LEVEL_PTE] >> spp_bit) & 1;
2153 }
2154 
2155 #endif
2156 
2157 #if BX_DEBUGGER
2158 
2159 void dbg_print_paging_pte(int level, Bit64u entry)
2160 {
2161   dbg_printf("%4s: 0x%08x%08x", bx_paging_level[level], GET32H(entry), GET32L(entry));
2162 
2163   if (entry & BX_CONST64(0x8000000000000000))
2164     dbg_printf(" XD");
2165   else
2166     dbg_printf("   ");
2167 
2168   if (level == BX_LEVEL_PTE) {
2169     dbg_printf("    %s %s %s",
2170       (entry & 0x0100) ? "G" : "g",
2171       (entry & 0x0080) ? "PAT" : "pat",
2172       (entry & 0x0040) ? "D" : "d");
2173   }
2174   else {
2175     if (entry & 0x80) {
2176       dbg_printf(" PS %s %s %s",
2177         (entry & 0x0100) ? "G" : "g",
2178         (entry & 0x1000) ? "PAT" : "pat",
2179         (entry & 0x0040) ? "D" : "d");
2180     }
2181     else {
2182       dbg_printf(" ps        ");
2183     }
2184   }
2185 
2186   dbg_printf(" %s %s %s %s %s %s\n",
2187     (entry & 0x20) ? "A" : "a",
2188     (entry & 0x10) ? "PCD" : "pcd",
2189     (entry & 0x08) ? "PWT" : "pwt",
2190     (entry & 0x04) ? "U" : "S",
2191     (entry & 0x02) ? "W" : "R",
2192     (entry & 0x01) ? "P" : "p");
2193 }
2194 
2195 #if BX_SUPPORT_VMX >= 2
2196 void dbg_print_ept_paging_pte(int level, Bit64u entry)
2197 {
2198   dbg_printf("EPT %4s: 0x%08x%08x", bx_paging_level[level], GET32H(entry), GET32L(entry));
2199 
2200   if (level != BX_LEVEL_PTE && (entry & 0x80))
2201     dbg_printf(" PS");
2202   else
2203     dbg_printf("   ");
2204 
2205   dbg_printf(" %s %s %s",
2206     (entry & 0x04) ? "E" : "e",
2207     (entry & 0x02) ? "W" : "w",
2208     (entry & 0x01) ? "R" : "r");
2209 
2210   if (level == BX_LEVEL_PTE || (entry & 0x80)) {
2211     dbg_printf(" %s %s\n",
2212       (entry & 0x40) ? "IGNORE_PAT" : "ignore_pat",
2213       get_memtype_name(BxMemtype((entry >> 3) & 0x7)));
2214   }
2215   else {
2216     dbg_printf("\n");
2217   }
2218 }
2219 #endif
2220 
2221 #endif // BX_DEBUGGER
2222 
2223 #if BX_SUPPORT_VMX >= 2
2224 bool BX_CPU_C::dbg_translate_guest_physical(bx_phy_address guest_paddr, bx_phy_address *phy, bool verbose)
2225 {
2226   VMCS_CACHE *vm = &BX_CPU_THIS_PTR vmcs;
2227   bx_phy_address pt_address = LPFOf(vm->eptptr);
2228   Bit64u offset_mask = BX_CONST64(0x0000ffffffffffff);
2229 
2230   for (int level = 3; level >= 0; --level) {
2231     Bit64u pte;
2232     pt_address += ((guest_paddr >> (9 + 9*level)) & 0xff8);
2233     offset_mask >>= 9;
2234     BX_MEM(0)->readPhysicalPage(BX_CPU_THIS, pt_address, 8, &pte);
2235 #if BX_DEBUGGER
2236     if (verbose)
2237       dbg_print_ept_paging_pte(level, pte);
2238 #endif
2239     switch(pte & 7) {
2240     case BX_EPT_ENTRY_NOT_PRESENT:
2241     case BX_EPT_ENTRY_WRITE_ONLY:
2242     case BX_EPT_ENTRY_WRITE_EXECUTE:
2243       return 0;
2244     }
2245     if (pte & BX_PAGING_PHY_ADDRESS_RESERVED_BITS)
2246       return 0;
2247 
2248     pt_address = bx_phy_address(pte & BX_CONST64(0x000ffffffffff000));
2249 
2250     if (level == BX_LEVEL_PTE) break;
2251 
2252     if (pte & 0x80) {
2253        if (level > (BX_LEVEL_PDE + !!is_cpu_extension_supported(BX_ISA_1G_PAGES)))
2254          return 0;
2255 
2256         pt_address &= BX_CONST64(0x000fffffffffe000);
2257         if (pt_address & offset_mask) return 0;
2258         break;
2259       }
2260   }
2261 
2262   *phy = pt_address + (bx_phy_address)(guest_paddr & offset_mask);
2263   return 1;
2264 }
2265 #endif
2266 
2267 bool BX_CPU_C::dbg_xlate_linear2phy(bx_address laddr, bx_phy_address *phy, bx_address *lpf_mask, bool verbose)
2268 {
2269   bx_phy_address paddress;
2270   bx_address offset_mask = 0xfff;
2271 
2272 #if BX_SUPPORT_X86_64
2273   if (! long_mode()) laddr &= 0xffffffff;
2274 #endif
2275 
2276   if (! BX_CPU_THIS_PTR cr0.get_PG()) {
2277     paddress = (bx_phy_address) laddr;
2278   }
2279   else {
2280     bx_phy_address pt_address = BX_CPU_THIS_PTR cr3 & BX_CR3_PAGING_MASK;
2281 
2282 #if BX_CPU_LEVEL >= 6
2283     if (BX_CPU_THIS_PTR cr4.get_PAE()) {
2284       offset_mask = BX_CONST64(0x0000ffffffffffff);
2285 
2286       int level = 3;
2287       if (! long_mode()) {
2288         pt_address = BX_CPU_THIS_PTR PDPTR_CACHE.entry[(laddr >> 30) & 3];
2289         if (! (pt_address & 0x1)) {
2290            offset_mask = 0x3fffffff;
2291            goto page_fault;
2292         }
2293         offset_mask >>= 18;
2294         pt_address &= BX_CONST64(0x000ffffffffff000);
2295         level = 1;
2296       }
2297 
2298       for (; level >= 0; --level) {
2299         Bit64u pte;
2300         pt_address += ((laddr >> (9 + 9*level)) & 0xff8);
2301         offset_mask >>= 9;
2302 #if BX_SUPPORT_VMX >= 2
2303         if (BX_CPU_THIS_PTR in_vmx_guest) {
2304           if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE)) {
2305             if (! dbg_translate_guest_physical(pt_address, &pt_address, verbose))
2306               goto page_fault;
2307           }
2308         }
2309 #endif
2310         BX_MEM(0)->readPhysicalPage(BX_CPU_THIS, pt_address, 8, &pte);
2311 #if BX_DEBUGGER
2312         if (verbose)
2313           dbg_print_paging_pte(level, pte);
2314 #endif
2315         if(!(pte & 1))
2316           goto page_fault;
2317         if (pte & BX_PAGING_PHY_ADDRESS_RESERVED_BITS)
2318           goto page_fault;
2319         pt_address = bx_phy_address(pte & BX_CONST64(0x000ffffffffff000));
2320         if (level == BX_LEVEL_PTE) break;
2321         if (pte & 0x80) {
2322           // large page
2323           pt_address &= BX_CONST64(0x000fffffffffe000);
2324           if (pt_address & offset_mask)
2325             goto page_fault;
2326           if (is_cpu_extension_supported(BX_ISA_1G_PAGES) && level == BX_LEVEL_PDPTE) break;
2327           if (level == BX_LEVEL_PDE) break;
2328           goto page_fault;
2329         }
2330       }
2331       paddress = pt_address + (bx_phy_address)(laddr & offset_mask);
2332     }
2333     else   // not PAE
2334 #endif
2335     {
2336       offset_mask = 0xfff;
2337       for (int level = 1; level >= 0; --level) {
2338         Bit32u pte;
2339         pt_address += ((laddr >> (10 + 10*level)) & 0xffc);
2340 #if BX_SUPPORT_VMX >= 2
2341         if (BX_CPU_THIS_PTR in_vmx_guest) {
2342           if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE)) {
2343             if (! dbg_translate_guest_physical(pt_address, &pt_address, verbose))
2344               goto page_fault;
2345           }
2346         }
2347 #endif
2348         BX_MEM(0)->readPhysicalPage(BX_CPU_THIS, pt_address, 4, &pte);
2349 #if BX_DEBUGGER
2350         if (verbose)
2351           dbg_print_paging_pte(level, pte);
2352 #endif
2353         if (!(pte & 1))
2354           goto page_fault;
2355         pt_address = pte & 0xfffff000;
2356 #if BX_CPU_LEVEL >= 6
2357         if (level == BX_LEVEL_PDE && (pte & 0x80) != 0 && BX_CPU_THIS_PTR cr4.get_PSE()) {
2358           offset_mask = 0x3fffff;
2359           pt_address = pte & 0xffc00000;
2360 #if BX_PHY_ADDRESS_WIDTH > 32
2361           pt_address += ((bx_phy_address)(pte & 0x003fe000)) << 19;
2362 #endif
2363           break;
2364         }
2365 #endif
2366       }
2367       paddress = pt_address + (bx_phy_address)(laddr & offset_mask);
2368     }
2369   }
2370 #if BX_SUPPORT_VMX >= 2
2371   if (BX_CPU_THIS_PTR in_vmx_guest) {
2372     if (SECONDARY_VMEXEC_CONTROL(VMX_VM_EXEC_CTRL3_EPT_ENABLE)) {
2373       if (! dbg_translate_guest_physical(paddress, &paddress, verbose))
2374         goto page_fault;
2375     }
2376   }
2377 #endif
2378 
2379   if (lpf_mask)
2380     *lpf_mask = offset_mask;
2381   *phy = A20ADDR(paddress);
2382   return 1;
2383 
2384 page_fault:
2385   if (lpf_mask)
2386     *lpf_mask = offset_mask;
2387   *phy = 0;
2388   return 0;
2389 }
2390 
2391 int BX_CPU_C::access_write_linear(bx_address laddr, unsigned len, unsigned curr_pl, unsigned xlate_rw, Bit32u ac_mask, void *data)
2392 {
2393 #if BX_SUPPORT_CET
2394   BX_ASSERT(xlate_rw == BX_WRITE || xlate_rw == BX_SHADOW_STACK_WRITE);
2395 #else
2396   BX_ASSERT(xlate_rw == BX_WRITE);
2397 #endif
2398 
2399   Bit32u pageOffset = PAGE_OFFSET(laddr);
2400 
2401   bool user = (curr_pl == 3);
2402 
2403   bx_TLB_entry *tlbEntry = BX_DTLB_ENTRY_OF(laddr, 0);
2404 
2405 #if BX_SUPPORT_X86_64
2406   if (! IsCanonical(laddr)) {
2407     BX_ERROR(("access_write_linear(): canonical failure"));
2408     return -1;
2409   }
2410 #endif
2411 
2412 #if BX_CPU_LEVEL >= 4 && BX_SUPPORT_ALIGNMENT_CHECK
2413   if (BX_CPU_THIS_PTR alignment_check() && user) {
2414     if (pageOffset & ac_mask) {
2415       BX_ERROR(("access_write_linear(): #AC misaligned access"));
2416       exception(BX_AC_EXCEPTION, 0);
2417     }
2418   }
2419 #endif
2420 
2421   /* check for reference across multiple pages */
2422   if ((pageOffset + len) <= 4096) {
2423     // Access within single page.
2424     BX_CPU_THIS_PTR address_xlation.paddress1 = translate_linear(tlbEntry, laddr, user, xlate_rw);
2425     BX_CPU_THIS_PTR address_xlation.pages     = 1;
2426 #if BX_SUPPORT_MEMTYPE
2427     BX_CPU_THIS_PTR address_xlation.memtype1  = tlbEntry->get_memtype();
2428 #endif
2429 
2430     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1,
2431                           len, tlbEntry->get_memtype(), xlate_rw, (Bit8u*) data);
2432 
2433     access_write_physical(BX_CPU_THIS_PTR address_xlation.paddress1, len, data);
2434 
2435 #if BX_X86_DEBUGGER
2436     hwbreakpoint_match(laddr, len, xlate_rw);
2437 #endif
2438   }
2439   else {
2440     // access across 2 pages
2441     BX_CPU_THIS_PTR address_xlation.len1 = 4096 - pageOffset;
2442     BX_CPU_THIS_PTR address_xlation.len2 = len - BX_CPU_THIS_PTR address_xlation.len1;
2443     BX_CPU_THIS_PTR address_xlation.pages = 2;
2444     bx_address laddr2 = laddr + BX_CPU_THIS_PTR address_xlation.len1;
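    // Illustrative example: a 4-byte write at page offset 0xffe splits into len1 = 2 bytes
    // on the first page and len2 = 2 bytes starting at laddr2 on the next page.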
2445 #if BX_SUPPORT_X86_64
2446     if (! long64_mode()) laddr2 &= 0xffffffff; /* handle linear address wrap in legacy mode */
2447     else {
2448       if (! IsCanonical(laddr2)) {
2449         BX_ERROR(("access_write_linear(): canonical failure for second half of page split access"));
2450         return -1;
2451       }
2452     }
2453 #endif
2454 
2455     bx_TLB_entry *tlbEntry2 = BX_DTLB_ENTRY_OF(laddr2, 0);
2456 
2457     BX_CPU_THIS_PTR address_xlation.paddress1 = translate_linear(tlbEntry, laddr, user, xlate_rw);
2458     BX_CPU_THIS_PTR address_xlation.paddress2 = translate_linear(tlbEntry2, laddr2, user, xlate_rw);
2459 #if BX_SUPPORT_MEMTYPE
2460     BX_CPU_THIS_PTR address_xlation.memtype1 = tlbEntry->get_memtype();
2461     BX_CPU_THIS_PTR address_xlation.memtype2 = tlbEntry2->get_memtype();
2462 #endif
2463 
2464 #ifdef BX_LITTLE_ENDIAN
2465     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1,
2466         BX_CPU_THIS_PTR address_xlation.len1, tlbEntry->get_memtype(),
2467         xlate_rw, (Bit8u*) data);
2468     access_write_physical(BX_CPU_THIS_PTR address_xlation.paddress1,
2469         BX_CPU_THIS_PTR address_xlation.len1, data);
2470     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr2, BX_CPU_THIS_PTR address_xlation.paddress2,
2471         BX_CPU_THIS_PTR address_xlation.len2, tlbEntry2->get_memtype(),
2472         xlate_rw, ((Bit8u*)data) + BX_CPU_THIS_PTR address_xlation.len1);
2473     access_write_physical(BX_CPU_THIS_PTR address_xlation.paddress2,
2474         BX_CPU_THIS_PTR address_xlation.len2,
2475         ((Bit8u*)data) + BX_CPU_THIS_PTR address_xlation.len1);
2476 #else // BX_BIG_ENDIAN
2477     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1,
2478         BX_CPU_THIS_PTR address_xlation.len1, tlbEntry->get_memtype(),
2479         xlate_rw, ((Bit8u*)data) + (len - BX_CPU_THIS_PTR address_xlation.len1));
2480     access_write_physical(BX_CPU_THIS_PTR address_xlation.paddress1,
2481         BX_CPU_THIS_PTR address_xlation.len1,
2482         ((Bit8u*)data) + (len - BX_CPU_THIS_PTR address_xlation.len1));
2483     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr2, BX_CPU_THIS_PTR address_xlation.paddress2,
2484         BX_CPU_THIS_PTR address_xlation.len2, tlbEntry2->get_memtype(),
2485         xlate_rw, (Bit8u*) data);
2486     access_write_physical(BX_CPU_THIS_PTR address_xlation.paddress2,
2487         BX_CPU_THIS_PTR address_xlation.len2, data);
2488 #endif
2489 
2490 #if BX_X86_DEBUGGER
2491     hwbreakpoint_match(laddr,  BX_CPU_THIS_PTR address_xlation.len1, xlate_rw);
2492     hwbreakpoint_match(laddr2, BX_CPU_THIS_PTR address_xlation.len2, xlate_rw);
2493 #endif
2494   }
2495 
2496   return 0;
2497 }
2498 
2499 int BX_CPU_C::access_read_linear(bx_address laddr, unsigned len, unsigned curr_pl, unsigned xlate_rw, Bit32u ac_mask, void *data)
2500 {
2501 #if BX_SUPPORT_CET
2502   BX_ASSERT(xlate_rw == BX_READ || xlate_rw == BX_RW || xlate_rw == BX_SHADOW_STACK_READ || xlate_rw == BX_SHADOW_STACK_RW);
2503 #else
2504   BX_ASSERT(xlate_rw == BX_READ || xlate_rw == BX_RW);
2505 #endif
2506 
2507   Bit32u pageOffset = PAGE_OFFSET(laddr);
2508 
2509   bool user = (curr_pl == 3);
2510 
2511 #if BX_SUPPORT_X86_64
2512   if (! IsCanonical(laddr)) {
2513     BX_ERROR(("access_read_linear(): canonical failure"));
2514     return -1;
2515   }
2516 #endif
2517 
2518 #if BX_CPU_LEVEL >= 4 && BX_SUPPORT_ALIGNMENT_CHECK
2519   if (BX_CPU_THIS_PTR alignment_check() && user) {
2520     if (pageOffset & ac_mask) {
2521       BX_ERROR(("access_read_linear(): #AC misaligned access"));
2522       exception(BX_AC_EXCEPTION, 0);
2523     }
2524   }
2525 #endif
2526 
2527   bx_TLB_entry *tlbEntry = BX_DTLB_ENTRY_OF(laddr, 0);
2528 
2529   /* check for reference across multiple pages */
2530   if ((pageOffset + len) <= 4096) {
2531     // Access within single page.
2532     BX_CPU_THIS_PTR address_xlation.paddress1 = translate_linear(tlbEntry, laddr, user, xlate_rw);
2533     BX_CPU_THIS_PTR address_xlation.pages     = 1;
2534 #if BX_SUPPORT_MEMTYPE
2535     BX_CPU_THIS_PTR address_xlation.memtype1  = tlbEntry->get_memtype();
2536 #endif
2537     access_read_physical(BX_CPU_THIS_PTR address_xlation.paddress1, len, data);
2538     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1, len, tlbEntry->get_memtype(), xlate_rw, (Bit8u*) data);
2539 
2540 #if BX_X86_DEBUGGER
2541     hwbreakpoint_match(laddr, len, xlate_rw);
2542 #endif
2543   }
2544   else {
2545     // access across 2 pages
2546     BX_CPU_THIS_PTR address_xlation.len1 = 4096 - pageOffset;
2547     BX_CPU_THIS_PTR address_xlation.len2 = len - BX_CPU_THIS_PTR address_xlation.len1;
2548     BX_CPU_THIS_PTR address_xlation.pages = 2;
2549     bx_address laddr2 = laddr + BX_CPU_THIS_PTR address_xlation.len1;
2550 #if BX_SUPPORT_X86_64
2551     if (! long64_mode()) laddr2 &= 0xffffffff; /* handle linear address wrap in legacy mode */
2552     else {
2553       if (! IsCanonical(laddr2)) {
2554         BX_ERROR(("access_read_linear(): canonical failure for second half of page split access"));
2555         return -1;
2556       }
2557     }
2558 #endif
2559 
2560     bx_TLB_entry *tlbEntry2 = BX_DTLB_ENTRY_OF(laddr2, 0);
2561 
2562     BX_CPU_THIS_PTR address_xlation.paddress1 = translate_linear(tlbEntry, laddr, user, xlate_rw);
2563     BX_CPU_THIS_PTR address_xlation.paddress2 = translate_linear(tlbEntry2, laddr2, user, xlate_rw);
2564 #if BX_SUPPORT_MEMTYPE
2565     BX_CPU_THIS_PTR address_xlation.memtype1 = tlbEntry->get_memtype();
2566     BX_CPU_THIS_PTR address_xlation.memtype2 = tlbEntry2->get_memtype();
2567 #endif
2568 
2569 #ifdef BX_LITTLE_ENDIAN
2570     access_read_physical(BX_CPU_THIS_PTR address_xlation.paddress1,
2571         BX_CPU_THIS_PTR address_xlation.len1, data);
2572     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1,
2573         BX_CPU_THIS_PTR address_xlation.len1, tlbEntry->get_memtype(),
2574         xlate_rw, (Bit8u*) data);
2575     access_read_physical(BX_CPU_THIS_PTR address_xlation.paddress2,
2576         BX_CPU_THIS_PTR address_xlation.len2,
2577         ((Bit8u*)data) + BX_CPU_THIS_PTR address_xlation.len1);
2578     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr2, BX_CPU_THIS_PTR address_xlation.paddress2,
2579         BX_CPU_THIS_PTR address_xlation.len2, tlbEntry2->get_memtype(),
2580         xlate_rw, ((Bit8u*)data) + BX_CPU_THIS_PTR address_xlation.len1);
2581 #else // BX_BIG_ENDIAN
2582     access_read_physical(BX_CPU_THIS_PTR address_xlation.paddress1,
2583         BX_CPU_THIS_PTR address_xlation.len1,
2584         ((Bit8u*)data) + (len - BX_CPU_THIS_PTR address_xlation.len1));
2585     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr, BX_CPU_THIS_PTR address_xlation.paddress1,
2586         BX_CPU_THIS_PTR address_xlation.len1, tlbEntry->get_memtype(),
2587         xlate_rw, ((Bit8u*)data) + (len - BX_CPU_THIS_PTR address_xlation.len1));
2588     access_read_physical(BX_CPU_THIS_PTR address_xlation.paddress2,
2589         BX_CPU_THIS_PTR address_xlation.len2, data);
2590     BX_NOTIFY_LIN_MEMORY_ACCESS(laddr2, BX_CPU_THIS_PTR address_xlation.paddress2,
2591         BX_CPU_THIS_PTR address_xlation.len2, tlbEntry2->get_memtype(),
2592         xlate_rw, (Bit8u*) data);
2593 #endif
2594 
2595 #if BX_X86_DEBUGGER
2596     hwbreakpoint_match(laddr,  BX_CPU_THIS_PTR address_xlation.len1, xlate_rw);
2597     hwbreakpoint_match(laddr2, BX_CPU_THIS_PTR address_xlation.len2, xlate_rw);
2598 #endif
2599   }
2600 
2601   return 0;
2602 }
2603 
2604 void BX_CPU_C::access_write_physical(bx_phy_address paddr, unsigned len, void *data)
2605 {
2606 #if BX_SUPPORT_VMX && BX_SUPPORT_X86_64
2607   if (is_virtual_apic_page(paddr)) {
2608     VMX_Virtual_Apic_Write(paddr, len, data);
2609     return;
2610   }
2611 #endif
2612 
2613 #if BX_SUPPORT_APIC
2614   if (BX_CPU_THIS_PTR lapic.is_selected(paddr)) {
2615     BX_CPU_THIS_PTR lapic.write(paddr, data, len);
2616     return;
2617   }
2618 #endif
2619 
2620   BX_MEM(0)->writePhysicalPage(BX_CPU_THIS, paddr, len, data);
2621 }
2622 
2623 void BX_CPU_C::access_read_physical(bx_phy_address paddr, unsigned len, void *data)
2624 {
2625 #if BX_SUPPORT_VMX && BX_SUPPORT_X86_64
2626   if (is_virtual_apic_page(paddr)) {
2627     paddr = VMX_Virtual_Apic_Read(paddr, len, data);
2628   }
2629 #endif
2630 
2631 #if BX_SUPPORT_APIC
2632   if (BX_CPU_THIS_PTR lapic.is_selected(paddr)) {
2633     BX_CPU_THIS_PTR lapic.read(paddr, data, len);
2634     return;
2635   }
2636 #endif
2637 
2638   BX_MEM(0)->readPhysicalPage(BX_CPU_THIS, paddr, len, data);
2639 }
2640 
2641 bx_hostpageaddr_t BX_CPU_C::getHostMemAddr(bx_phy_address paddr, unsigned rw)
2642 {
2643 #if BX_SUPPORT_VMX && BX_SUPPORT_X86_64
2644   if (is_virtual_apic_page(paddr))
2645     return 0; // Do not allow direct access to virtual apic page
2646 #endif
2647 
2648 #if BX_SUPPORT_APIC
2649   if (BX_CPU_THIS_PTR lapic.is_selected(paddr))
2650     return 0; // Vetoed!  APIC address space
2651 #endif
2652 
2653   return (bx_hostpageaddr_t) BX_MEM(0)->getHostMemAddr(BX_CPU_THIS, paddr, rw);
2654 }
2655 
2656 #if BX_LARGE_RAMFILE
2657 bool BX_CPU_C::check_addr_in_tlb_buffers(const Bit8u *addr, const Bit8u *end)
2658 {
2659 #if BX_SUPPORT_VMX
2660   if (BX_CPU_THIS_PTR vmcshostptr) {
2661     if ((BX_CPU_THIS_PTR vmcshostptr >= (const bx_hostpageaddr_t)addr) &&
2662         (BX_CPU_THIS_PTR vmcshostptr  < (const bx_hostpageaddr_t)end)) return true;
2663   }
2664 #endif
2665 
2666 #if BX_SUPPORT_SVM
2667   if (BX_CPU_THIS_PTR vmcbhostptr) {
2668     if ((BX_CPU_THIS_PTR vmcbhostptr >= (const bx_hostpageaddr_t)addr) &&
2669         (BX_CPU_THIS_PTR vmcbhostptr  < (const bx_hostpageaddr_t)end)) return true;
2670   }
2671 #endif
2672 
2673   for (unsigned tlb_entry_num=0; tlb_entry_num < BX_DTLB_SIZE; tlb_entry_num++) {
2674     bx_TLB_entry *tlbEntry = &BX_CPU_THIS_PTR DTLB.entry[tlb_entry_num];
2675     if (tlbEntry->valid()) {
2676       if ((tlbEntry->hostPageAddr >= (const bx_hostpageaddr_t)addr) &&
2677           (tlbEntry->hostPageAddr  < (const bx_hostpageaddr_t)end))
2678         return true;
2679     }
2680   }
2681 
2682   for (unsigned tlb_entry_num=0; tlb_entry_num < BX_ITLB_SIZE; tlb_entry_num++) {
2683     bx_TLB_entry *tlbEntry = &BX_CPU_THIS_PTR ITLB.entry[tlb_entry_num];
2684     if (tlbEntry->valid()) {
2685       if ((tlbEntry->hostPageAddr >= (const bx_hostpageaddr_t)addr) &&
2686           (tlbEntry->hostPageAddr  < (const bx_hostpageaddr_t)end))
2687         return true;
2688     }
2689   }
2690 
2691   return false;
2692 }
2693 #endif
2694