1 // Copyright 2008 Dolphin Emulator Project
2 // Licensed under GPLv2+
3 // Refer to the license.txt file included.
4 
#include "VideoCommon/Fifo.h"

#include <algorithm>
#include <atomic>
#include <cstring>

#include "Common/Assert.h"
#include "Common/Atomic.h"
#include "Common/BlockingLoop.h"
#include "Common/ChunkFile.h"
#include "Common/Event.h"
#include "Common/FPURoundMode.h"
#include "Common/MemoryUtil.h"
#include "Common/MsgHandler.h"

#include "Core/ConfigManager.h"
#include "Core/CoreTiming.h"
#include "Core/HW/Memmap.h"
#include "Core/Host.h"

#include "VideoCommon/AsyncRequests.h"
#include "VideoCommon/CPMemory.h"
#include "VideoCommon/CommandProcessor.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/OpcodeDecoding.h"
#include "VideoCommon/VertexLoaderManager.h"
#include "VideoCommon/VertexManagerBase.h"
#include "VideoCommon/VideoBackendBase.h"
32 
33 namespace Fifo
34 {
// Size in bytes of the aux FIFO array below, and of the video buffer that Init()
// allocates (plus a few bytes of padding).
static constexpr u32 FIFO_SIZE = 2 * 1024 * 1024;
// Delay handed to CoreTiming::ScheduleEvent when the sync GPU callback is (re)armed,
// and the minimum re-poll delay returned by RunGpuOnCpu() / WaitForGpuThread().
static constexpr int GPU_TIME_SLOT_SIZE = 1000;

// The loop object RunGpuLoop() runs its per-iteration work on.  Other functions in
// this file wake it, wait on it, or allow it to sleep.
static Common::BlockingLoop s_gpu_mainloop;

// Set through EmulatorState(); while it is clear, an iteration of RunGpuLoop() only
// pulls async requests and then returns.
static Common::Flag s_emu_running_state;

// Auxiliary FIFO: PushFifoAuxBuffer() appends at the write pointer and
// PopFifoAuxBuffer() consumes from the read pointer.
// Most of this array is unlikely to be faulted in...
static u8 s_fifo_aux_data[FIFO_SIZE];
static u8* s_fifo_aux_write_ptr;
static u8* s_fifo_aux_read_ptr;

// This could be in SConfig, but it depends on multiple settings
// and can change at runtime.  It is only written by UpdateWantDeterminism().
static bool s_use_deterministic_gpu_thread;

// Event handle registered for SyncGPUCallback in Prepare().
static CoreTiming::EventType* s_event_sync_gpu;

// STATE_TO_SAVE
// Ownership of the video buffer pointers:
// The read_ptr is always owned by the GPU thread.  In normal mode, so is the
// write_ptr, despite it being atomic.  In deterministic GPU thread mode,
// things get a bit more complicated:
// - The seen_ptr is written by the GPU thread, and marks how far it has
//   processed the buffer as much as possible.  When a partial command made it
//   stop, this is not the same as the read_ptr.  It's written by the GPU,
//   under the lock, and updating the cond.
// - The write_ptr is written by the CPU thread after it copies data from the
//   FIFO.  Maybe someday it will be under the lock.  For now, because RunGpuLoop
//   polls, it's just atomic.
// - The pp_read_ptr is the CPU preprocessing version of the read_ptr.
static u8* s_video_buffer;
static u8* s_video_buffer_read_ptr;
static std::atomic<u8*> s_video_buffer_write_ptr;
static std::atomic<u8*> s_video_buffer_seen_ptr;
static u8* s_video_buffer_pp_read_ptr;

// Tick balance between CPU and GPU: WaitForGpuThread() adds elapsed ticks,
// RunGpuLoop() subtracts the cycles it executed, RunGpuOnCpu() stores any deficit.
static std::atomic<int> s_sync_ticks;
// True while SyncGPUCallback is not scheduled; RunGpu() re-arms it.
static bool s_syncing_suspended;
// Set by RunGpuLoop() when s_sync_ticks drops back below iSyncGpuMaxDistance;
// waited on by WaitForGpuThread().
static Common::Event s_sync_wakeup_event;
74 
DoState(PointerWrap & p)75 void DoState(PointerWrap& p)
76 {
77   p.DoArray(s_video_buffer, FIFO_SIZE);
78   u8* write_ptr = s_video_buffer_write_ptr;
79   p.DoPointer(write_ptr, s_video_buffer);
80   s_video_buffer_write_ptr = write_ptr;
81   p.DoPointer(s_video_buffer_read_ptr, s_video_buffer);
82   if (p.mode == PointerWrap::MODE_READ && s_use_deterministic_gpu_thread)
83   {
84     // We're good and paused, right?
85     s_video_buffer_seen_ptr = s_video_buffer_pp_read_ptr = s_video_buffer_read_ptr;
86   }
87 
88   p.Do(s_sync_ticks);
89   p.Do(s_syncing_suspended);
90 }
91 
PauseAndLock(bool doLock,bool unpauseOnUnlock)92 void PauseAndLock(bool doLock, bool unpauseOnUnlock)
93 {
94   if (doLock)
95   {
96     SyncGPU(SyncGPUReason::Other);
97     EmulatorState(false);
98 
99     const SConfig& param = SConfig::GetInstance();
100 
101     if (!param.bCPUThread || s_use_deterministic_gpu_thread)
102       return;
103 
104     s_gpu_mainloop.WaitYield(std::chrono::milliseconds(100), Host_YieldToUI);
105   }
106   else
107   {
108     if (unpauseOnUnlock)
109       EmulatorState(true);
110   }
111 }
112 
Init()113 void Init()
114 {
115   // Padded so that SIMD overreads in the vertex loader are safe
116   s_video_buffer = static_cast<u8*>(Common::AllocateMemoryPages(FIFO_SIZE + 4));
117   ResetVideoBuffer();
118   if (SConfig::GetInstance().bCPUThread)
119     s_gpu_mainloop.Prepare();
120   s_sync_ticks.store(0);
121 }
122 
Shutdown()123 void Shutdown()
124 {
125   if (s_gpu_mainloop.IsRunning())
126     PanicAlert("Fifo shutting down while active");
127 
128   Common::FreeMemoryPages(s_video_buffer, FIFO_SIZE + 4);
129   s_video_buffer = nullptr;
130   s_video_buffer_write_ptr = nullptr;
131   s_video_buffer_pp_read_ptr = nullptr;
132   s_video_buffer_read_ptr = nullptr;
133   s_video_buffer_seen_ptr = nullptr;
134   s_fifo_aux_write_ptr = nullptr;
135   s_fifo_aux_read_ptr = nullptr;
136 }
137 
138 // May be executed from any thread, even the graphics thread.
139 // Created to allow for self shutdown.
ExitGpuLoop()140 void ExitGpuLoop()
141 {
142   // This should break the wait loop in CPU thread
143   CommandProcessor::fifo.bFF_GPReadEnable = false;
144   FlushGpu();
145 
146   // Terminate GPU thread loop
147   s_emu_running_state.Set();
148   s_gpu_mainloop.Stop(s_gpu_mainloop.kNonBlock);
149 }
150 
EmulatorState(bool running)151 void EmulatorState(bool running)
152 {
153   s_emu_running_state.Set(running);
154   if (running)
155     s_gpu_mainloop.Wakeup();
156   else
157     s_gpu_mainloop.AllowSleep();
158 }
159 
SyncGPU(SyncGPUReason reason,bool may_move_read_ptr)160 void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr)
161 {
162   if (s_use_deterministic_gpu_thread)
163   {
164     s_gpu_mainloop.Wait();
165     if (!s_gpu_mainloop.IsRunning())
166       return;
167 
168     // Opportunistically reset FIFOs so we don't wrap around.
169     if (may_move_read_ptr && s_fifo_aux_write_ptr != s_fifo_aux_read_ptr)
170       PanicAlert("aux fifo not synced (%p, %p)", s_fifo_aux_write_ptr, s_fifo_aux_read_ptr);
171 
172     memmove(s_fifo_aux_data, s_fifo_aux_read_ptr, s_fifo_aux_write_ptr - s_fifo_aux_read_ptr);
173     s_fifo_aux_write_ptr -= (s_fifo_aux_read_ptr - s_fifo_aux_data);
174     s_fifo_aux_read_ptr = s_fifo_aux_data;
175 
176     if (may_move_read_ptr)
177     {
178       u8* write_ptr = s_video_buffer_write_ptr;
179 
180       // what's left over in the buffer
181       size_t size = write_ptr - s_video_buffer_pp_read_ptr;
182 
183       memmove(s_video_buffer, s_video_buffer_pp_read_ptr, size);
184       // This change always decreases the pointers.  We write seen_ptr
185       // after write_ptr here, and read it before in RunGpuLoop, so
186       // 'write_ptr > seen_ptr' there cannot become spuriously true.
187       s_video_buffer_write_ptr = write_ptr = s_video_buffer + size;
188       s_video_buffer_pp_read_ptr = s_video_buffer;
189       s_video_buffer_read_ptr = s_video_buffer;
190       s_video_buffer_seen_ptr = write_ptr;
191     }
192   }
193 }
194 
PushFifoAuxBuffer(const void * ptr,size_t size)195 void PushFifoAuxBuffer(const void* ptr, size_t size)
196 {
197   if (size > (size_t)(s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr))
198   {
199     SyncGPU(SyncGPUReason::AuxSpace, /* may_move_read_ptr */ false);
200     if (!s_gpu_mainloop.IsRunning())
201     {
202       // GPU is shutting down
203       return;
204     }
205     if (size > (size_t)(s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr))
206     {
207       // That will sync us up to the last 32 bytes, so this short region
208       // of FIFO would have to point to a 2MB display list or something.
209       PanicAlert("absurdly large aux buffer");
210       return;
211     }
212   }
213   memcpy(s_fifo_aux_write_ptr, ptr, size);
214   s_fifo_aux_write_ptr += size;
215 }
216 
PopFifoAuxBuffer(size_t size)217 void* PopFifoAuxBuffer(size_t size)
218 {
219   void* ret = s_fifo_aux_read_ptr;
220   s_fifo_aux_read_ptr += size;
221   return ret;
222 }
223 
224 // Description: RunGpuLoop() sends data through this function.
ReadDataFromFifo(u32 readPtr)225 static void ReadDataFromFifo(u32 readPtr)
226 {
227   size_t len = 32;
228   if (len > (size_t)(s_video_buffer + FIFO_SIZE - s_video_buffer_write_ptr))
229   {
230     size_t existing_len = s_video_buffer_write_ptr - s_video_buffer_read_ptr;
231     if (len > (size_t)(FIFO_SIZE - existing_len))
232     {
233       PanicAlert("FIFO out of bounds (existing %zu + new %zu > %u)", existing_len, len, FIFO_SIZE);
234       return;
235     }
236     memmove(s_video_buffer, s_video_buffer_read_ptr, existing_len);
237     s_video_buffer_write_ptr = s_video_buffer + existing_len;
238     s_video_buffer_read_ptr = s_video_buffer;
239   }
240   // Copy new video instructions to s_video_buffer for future use in rendering the new picture
241   Memory::CopyFromEmu(s_video_buffer_write_ptr, readPtr, len);
242   s_video_buffer_write_ptr += len;
243 }
244 
245 // The deterministic_gpu_thread version.
ReadDataFromFifoOnCPU(u32 readPtr)246 static void ReadDataFromFifoOnCPU(u32 readPtr)
247 {
248   size_t len = 32;
249   u8* write_ptr = s_video_buffer_write_ptr;
250   if (len > (size_t)(s_video_buffer + FIFO_SIZE - write_ptr))
251   {
252     // We can't wrap around while the GPU is working on the data.
253     // This should be very rare due to the reset in SyncGPU.
254     SyncGPU(SyncGPUReason::Wraparound);
255     if (!s_gpu_mainloop.IsRunning())
256     {
257       // GPU is shutting down, so the next asserts may fail
258       return;
259     }
260 
261     if (s_video_buffer_pp_read_ptr != s_video_buffer_read_ptr)
262     {
263       PanicAlert("desynced read pointers");
264       return;
265     }
266     write_ptr = s_video_buffer_write_ptr;
267     size_t existing_len = write_ptr - s_video_buffer_pp_read_ptr;
268     if (len > (size_t)(FIFO_SIZE - existing_len))
269     {
270       PanicAlert("FIFO out of bounds (existing %zu + new %zu > %u)", existing_len, len, FIFO_SIZE);
271       return;
272     }
273   }
274   Memory::CopyFromEmu(s_video_buffer_write_ptr, readPtr, len);
275   s_video_buffer_pp_read_ptr = OpcodeDecoder::Run<true>(
276       DataReader(s_video_buffer_pp_read_ptr, write_ptr + len), nullptr, false);
277   // This would have to be locked if the GPU thread didn't spin.
278   s_video_buffer_write_ptr = write_ptr + len;
279 }
280 
ResetVideoBuffer()281 void ResetVideoBuffer()
282 {
283   s_video_buffer_read_ptr = s_video_buffer;
284   s_video_buffer_write_ptr = s_video_buffer;
285   s_video_buffer_seen_ptr = s_video_buffer;
286   s_video_buffer_pp_read_ptr = s_video_buffer;
287   s_fifo_aux_write_ptr = s_fifo_aux_data;
288   s_fifo_aux_read_ptr = s_fifo_aux_data;
289 }
290 
// Description: Main FIFO update loop
// Purpose: Keep the Core HW updated about the CPU-GPU distance
//
// Enables async requests (non-passthrough) for the lifetime of the loop, runs the
// per-iteration lambda on s_gpu_mainloop until that loop ends, then switches async
// requests back to disabled/passthrough.
void RunGpuLoop()
{
  AsyncRequests::GetInstance()->SetEnable(true);
  AsyncRequests::GetInstance()->SetPassthrough(false);

  s_gpu_mainloop.Run(
      [] {
        const SConfig& param = SConfig::GetInstance();

        // Run events from the CPU thread.
        AsyncRequests::GetInstance()->PullEvents();

        // Do nothing while paused
        if (!s_emu_running_state.IsSet())
          return;

        if (s_use_deterministic_gpu_thread)
        {
          // All the fifo/CP stuff is on the CPU.  We just need to run the opcode decoder.
          // seen_ptr is loaded before write_ptr on purpose; SyncGPU stores them in the
          // opposite order.
          u8* seen_ptr = s_video_buffer_seen_ptr;
          u8* write_ptr = s_video_buffer_write_ptr;
          // See comment in SyncGPU
          if (write_ptr > seen_ptr)
          {
            s_video_buffer_read_ptr =
                OpcodeDecoder::Run(DataReader(s_video_buffer_read_ptr, write_ptr), nullptr, false);
            // Record how far the data has been looked at, even if a partial command
            // left read_ptr behind write_ptr.
            s_video_buffer_seen_ptr = write_ptr;
          }
        }
        else
        {
          CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo;
          CommandProcessor::SetCPStatusFromGPU();

          // check if we are able to run this buffer
          while (!CommandProcessor::IsInterruptWaiting() && fifo.bFF_GPReadEnable &&
                 fifo.CPReadWriteDistance && !AtBreakpoint())
          {
            // With SyncGPU enabled, stop until the CPU has built up enough of a lead.
            if (param.bSyncGPU && s_sync_ticks.load() < param.iSyncGpuMinDistance)
              break;

            u32 cyclesExecuted = 0;
            u32 readPtr = fifo.CPReadPointer;
            ReadDataFromFifo(readPtr);

            // Advance the local read pointer by one 32-byte chunk, wrapping from
            // CPEnd back to CPBase.
            if (readPtr == fifo.CPEnd)
              readPtr = fifo.CPBase;
            else
              readPtr += 32;

            ASSERT_MSG(COMMANDPROCESSOR, (s32)fifo.CPReadWriteDistance - 32 >= 0,
                       "Negative fifo.CPReadWriteDistance = %i in FIFO Loop !\nThat can produce "
                       "instability in the game. Please report it.",
                       fifo.CPReadWriteDistance - 32);

            u8* write_ptr = s_video_buffer_write_ptr;
            s_video_buffer_read_ptr = OpcodeDecoder::Run(
                DataReader(s_video_buffer_read_ptr, write_ptr), &cyclesExecuted, false);

            // Publish the consumed chunk: new read pointer, and distance reduced by 32
            // (added as the unsigned representation of -32).
            Common::AtomicStore(fifo.CPReadPointer, readPtr);
            Common::AtomicAdd(fifo.CPReadWriteDistance, static_cast<u32>(-32));
            // SafeCPReadPointer is only moved when the decoder consumed everything
            // that was buffered.
            if ((write_ptr - s_video_buffer_read_ptr) == 0)
              Common::AtomicStore(fifo.SafeCPReadPointer, fifo.CPReadPointer);

            CommandProcessor::SetCPStatusFromGPU();

            if (param.bSyncGPU)
            {
              // Scale the executed cycles by the overclock factor, charge them against
              // s_sync_ticks, and signal the wakeup event when the counter crosses from
              // at-or-above iSyncGpuMaxDistance to below it.
              cyclesExecuted = (int)(cyclesExecuted / param.fSyncGpuOverclock);
              int old = s_sync_ticks.fetch_sub(cyclesExecuted);
              if (old >= param.iSyncGpuMaxDistance &&
                  old - (int)cyclesExecuted < param.iSyncGpuMaxDistance)
                s_sync_wakeup_event.Set();
            }

            // This call is pretty important in DualCore mode and must be called in the FIFO Loop.
            // If we don't, s_swapRequested or s_efbAccessRequested won't be set to false
            // leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing
            // things down.
            AsyncRequests::GetInstance()->PullEvents();
          }

          // fast skip remaining GPU time if fifo is empty
          if (s_sync_ticks.load() > 0)
          {
            // Zero the counter; signal the wakeup event if it had reached
            // iSyncGpuMaxDistance.
            int old = s_sync_ticks.exchange(0);
            if (old >= param.iSyncGpuMaxDistance)
              s_sync_wakeup_event.Set();
          }

          // The fifo is empty and it's unlikely we will get any more work in the near future.
          // Make sure VertexManager finishes drawing any primitives it has stored in its buffer.
          g_vertex_manager->Flush();
        }
      },
      // NOTE(review): presumably a timeout/poll interval for the blocking loop - confirm
      // against Common::BlockingLoop::Run.
      100);

  AsyncRequests::GetInstance()->SetEnable(false);
  AsyncRequests::GetInstance()->SetPassthrough(true);
}
393 
FlushGpu()394 void FlushGpu()
395 {
396   const SConfig& param = SConfig::GetInstance();
397 
398   if (!param.bCPUThread || s_use_deterministic_gpu_thread)
399     return;
400 
401   s_gpu_mainloop.Wait();
402 }
403 
// Tells the GPU loop that it is allowed to go to sleep; forwards to
// s_gpu_mainloop.AllowSleep().
void GpuMaySleep()
{
  s_gpu_mainloop.AllowSleep();
}
408 
AtBreakpoint()409 bool AtBreakpoint()
410 {
411   CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo;
412   return fifo.bFF_BPEnable && (fifo.CPReadPointer == fifo.CPBreakpoint);
413 }
414 
RunGpu()415 void RunGpu()
416 {
417   const SConfig& param = SConfig::GetInstance();
418 
419   // wake up GPU thread
420   if (param.bCPUThread && !s_use_deterministic_gpu_thread)
421   {
422     s_gpu_mainloop.Wakeup();
423   }
424 
425   // if the sync GPU callback is suspended, wake it up.
426   if (!SConfig::GetInstance().bCPUThread || s_use_deterministic_gpu_thread ||
427       SConfig::GetInstance().bSyncGPU)
428   {
429     if (s_syncing_suspended)
430     {
431       s_syncing_suspended = false;
432       CoreTiming::ScheduleEvent(GPU_TIME_SLOT_SIZE, s_event_sync_gpu, GPU_TIME_SLOT_SIZE);
433     }
434   }
435 }
436 
RunGpuOnCpu(int ticks)437 static int RunGpuOnCpu(int ticks)
438 {
439   CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo;
440   bool reset_simd_state = false;
441   int available_ticks = int(ticks * SConfig::GetInstance().fSyncGpuOverclock) + s_sync_ticks.load();
442   while (fifo.bFF_GPReadEnable && fifo.CPReadWriteDistance && !AtBreakpoint() &&
443          available_ticks >= 0)
444   {
445     if (s_use_deterministic_gpu_thread)
446     {
447       ReadDataFromFifoOnCPU(fifo.CPReadPointer);
448       s_gpu_mainloop.Wakeup();
449     }
450     else
451     {
452       if (!reset_simd_state)
453       {
454         FPURoundMode::SaveSIMDState();
455         FPURoundMode::LoadDefaultSIMDState();
456         reset_simd_state = true;
457       }
458       ReadDataFromFifo(fifo.CPReadPointer);
459       u32 cycles = 0;
460       s_video_buffer_read_ptr = OpcodeDecoder::Run(
461           DataReader(s_video_buffer_read_ptr, s_video_buffer_write_ptr), &cycles, false);
462       available_ticks -= cycles;
463     }
464 
465     if (fifo.CPReadPointer == fifo.CPEnd)
466       fifo.CPReadPointer = fifo.CPBase;
467     else
468       fifo.CPReadPointer += 32;
469 
470     fifo.CPReadWriteDistance -= 32;
471   }
472 
473   CommandProcessor::SetCPStatusFromGPU();
474 
475   if (reset_simd_state)
476   {
477     FPURoundMode::LoadSIMDState();
478   }
479 
480   // Discard all available ticks as there is nothing to do any more.
481   s_sync_ticks.store(std::min(available_ticks, 0));
482 
483   // If the GPU is idle, drop the handler.
484   if (available_ticks >= 0)
485     return -1;
486 
487   // Always wait at least for GPU_TIME_SLOT_SIZE cycles.
488   return -available_ticks + GPU_TIME_SLOT_SIZE;
489 }
490 
UpdateWantDeterminism(bool want)491 void UpdateWantDeterminism(bool want)
492 {
493   // We are paused (or not running at all yet), so
494   // it should be safe to change this.
495   const SConfig& param = SConfig::GetInstance();
496   bool gpu_thread = false;
497   switch (param.m_GPUDeterminismMode)
498   {
499   case GPUDeterminismMode::Auto:
500     gpu_thread = want;
501     break;
502   case GPUDeterminismMode::Disabled:
503     gpu_thread = false;
504     break;
505   case GPUDeterminismMode::FakeCompletion:
506     gpu_thread = true;
507     break;
508   }
509 
510   gpu_thread = gpu_thread && param.bCPUThread;
511 
512   if (s_use_deterministic_gpu_thread != gpu_thread)
513   {
514     s_use_deterministic_gpu_thread = gpu_thread;
515     if (gpu_thread)
516     {
517       // These haven't been updated in non-deterministic mode.
518       s_video_buffer_seen_ptr = s_video_buffer_pp_read_ptr = s_video_buffer_read_ptr;
519       CopyPreprocessCPStateFromMain();
520       VertexLoaderManager::MarkAllDirty();
521     }
522   }
523 }
524 
// Returns whether deterministic GPU thread mode is currently active, as last
// decided by UpdateWantDeterminism().
bool UseDeterministicGPUThread()
{
  return s_use_deterministic_gpu_thread;
}
529 
530 /* This function checks the emulated CPU - GPU distance and may wake up the GPU,
531  * or block the CPU if required. It should be called by the CPU thread regularly.
532  * @ticks The gone emulated CPU time.
533  * @return A good time to call WaitForGpuThread() next.
534  */
WaitForGpuThread(int ticks)535 static int WaitForGpuThread(int ticks)
536 {
537   const SConfig& param = SConfig::GetInstance();
538 
539   int old = s_sync_ticks.fetch_add(ticks);
540   int now = old + ticks;
541 
542   // GPU is idle, so stop polling.
543   if (old >= 0 && s_gpu_mainloop.IsDone())
544     return -1;
545 
546   // Wakeup GPU
547   if (old < param.iSyncGpuMinDistance && now >= param.iSyncGpuMinDistance)
548     RunGpu();
549 
550   // If the GPU is still sleeping, wait for a longer time
551   if (now < param.iSyncGpuMinDistance)
552     return GPU_TIME_SLOT_SIZE + param.iSyncGpuMinDistance - now;
553 
554   // Wait for GPU
555   if (now >= param.iSyncGpuMaxDistance)
556     s_sync_wakeup_event.Wait();
557 
558   return GPU_TIME_SLOT_SIZE;
559 }
560 
SyncGPUCallback(u64 ticks,s64 cyclesLate)561 static void SyncGPUCallback(u64 ticks, s64 cyclesLate)
562 {
563   ticks += cyclesLate;
564   int next = -1;
565 
566   if (!SConfig::GetInstance().bCPUThread || s_use_deterministic_gpu_thread)
567   {
568     next = RunGpuOnCpu((int)ticks);
569   }
570   else if (SConfig::GetInstance().bSyncGPU)
571   {
572     next = WaitForGpuThread((int)ticks);
573   }
574 
575   s_syncing_suspended = next < 0;
576   if (!s_syncing_suspended)
577     CoreTiming::ScheduleEvent(next, s_event_sync_gpu, next);
578 }
579 
// Initialize GPU - CPU thread syncing, this gives us a deterministic way to start the GPU thread.
// Registers SyncGPUCallback with CoreTiming and starts with the callback marked
// as suspended, so the first RunGpu() that needs it will schedule it.
void Prepare()
{
  s_event_sync_gpu = CoreTiming::RegisterEvent("SyncGPUCallback", SyncGPUCallback);
  s_syncing_suspended = true;
}
586 }  // namespace Fifo
587