/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use std::{cmp, mem};
use api::units::*;
use malloc_size_of::MallocSizeOfOps;
use crate::{
    device::{CustomVAO, Device, DrawTarget, Program, ReadTarget, Texture, TextureFilter, UploadPBOPool, VBO},
    gpu_cache::{GpuBlockData, GpuCacheUpdate, GpuCacheUpdateList},
    internal_types::{FrameId, RenderTargetInfo, Swizzle},
    prim_store::DeferredResolve,
    profiler,
    render_api::MemoryReport,
};

/// When enabled, this toggle forces the GPU cache scattered texture to
/// be resized every frame, which lets GPU debuggers verify that the
/// resize is performed correctly.
const GPU_CACHE_RESIZE_TEST: bool = false;

/// Tracks the state of each row in the GPU cache texture.
struct CacheRow {
    /// Mirrored block data on CPU for this row. We store a copy of
    /// the data on the CPU side to improve upload batching.
    cpu_blocks: Box<[GpuBlockData; super::MAX_VERTEX_TEXTURE_WIDTH]>,
    /// The first offset in this row that is dirty.
    min_dirty: u16,
    /// One past the last offset in this row that is dirty.
    max_dirty: u16,
}

impl CacheRow {
    fn new() -> Self {
        CacheRow {
            cpu_blocks: Box::new([GpuBlockData::EMPTY; super::MAX_VERTEX_TEXTURE_WIDTH]),
            min_dirty: super::MAX_VERTEX_TEXTURE_WIDTH as _,
            max_dirty: 0,
        }
    }

    fn is_dirty(&self) -> bool {
        self.min_dirty < self.max_dirty
    }

    fn clear_dirty(&mut self) {
        self.min_dirty = super::MAX_VERTEX_TEXTURE_WIDTH as _;
        self.max_dirty = 0;
    }

    fn add_dirty(&mut self, block_offset: usize, block_count: usize) {
        self.min_dirty = self.min_dirty.min(block_offset as _);
        self.max_dirty = self.max_dirty.max((block_offset + block_count) as _);
    }

    fn dirty_blocks(&self) -> &[GpuBlockData] {
        &self.cpu_blocks[self.min_dirty as usize .. self.max_dirty as usize]
    }
}

/// The bus over which CPU and GPU versions of the GPU cache
/// get synchronized.
enum GpuCacheBus {
    /// PBO-based updates. These currently operate at row granularity,
    /// and are therefore subject to fragmentation issues.
    PixelBuffer {
        /// Per-row data.
        rows: Vec<CacheRow>,
    },
    /// Shader-based scattering updates. Currently rendered by a set
    /// of points into the GPU texture, each carrying a `GpuBlockData`.
    Scatter {
        /// Special program to run the scattered update.
        program: Program,
        /// VAO containing the source vertex buffers.
        vao: CustomVAO,
        /// VBO for positional data, supplied as normalized `u16`.
        buf_position: VBO<[u16; 2]>,
        /// VBO for gpu block data.
        buf_value: VBO<GpuBlockData>,
        /// Currently stored block count.
        count: usize,
    },
}

/// The device-specific representation of the cache texture in gpu_cache.rs
pub struct GpuCacheTexture {
    texture: Option<Texture>,
    bus: GpuCacheBus,
}

impl GpuCacheTexture {
    /// Ensures that we have an appropriately-sized texture.
    fn ensure_texture(&mut self, device: &mut Device, height: i32) {
        // If we already have a texture that works, we're done.
        if self.texture.as_ref().map_or(false, |t| t.get_dimensions().height >= height) {
            if GPU_CACHE_RESIZE_TEST {
                // Special debug mode - resize the texture even though it's fine.
            } else {
                return;
            }
        }

        // Take the old texture, if any.
        let blit_source = self.texture.take();

        // Create the new texture.
        assert!(height >= 2, "Height is too small for ANGLE");
        let new_size = DeviceIntSize::new(super::MAX_VERTEX_TEXTURE_WIDTH as _, height);
        // GpuCacheBus::Scatter always requires the texture to be a render target. For
        // GpuCacheBus::PixelBuffer, we only create the texture with a render target if
        // RGBAF32 render targets are actually supported, and only if glCopyImageSubData
        // is not supported. glCopyImageSubData does not require a render target to copy
        // the texture data, and if neither RGBAF32 render targets nor glCopyImageSubData
        // is supported, we simply re-upload the entire contents rather than copying upon
        // resize.
        let supports_copy_image_sub_data = device.get_capabilities().supports_copy_image_sub_data;
        let supports_color_buffer_float = device.get_capabilities().supports_color_buffer_float;
        let rt_info = if matches!(self.bus, GpuCacheBus::PixelBuffer { .. })
            && (supports_copy_image_sub_data || !supports_color_buffer_float)
        {
            None
        } else {
            Some(RenderTargetInfo { has_depth: false })
        };
        let mut texture = device.create_texture(
            api::ImageBufferKind::Texture2D,
            api::ImageFormat::RGBAF32,
            new_size.width,
            new_size.height,
            TextureFilter::Nearest,
            rt_info,
        );

        // Copy the contents of the previous texture, if applicable.
        if let Some(blit_source) = blit_source {
            if !supports_copy_image_sub_data && !supports_color_buffer_float {
                // Cannot copy texture, so must re-upload everything.
                match self.bus {
                    GpuCacheBus::PixelBuffer { ref mut rows } => {
                        for row in rows {
                            row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH);
                        }
                    }
                    GpuCacheBus::Scatter { .. } => {
                        panic!("Texture must be copyable to use scatter GPU cache bus method");
                    }
                }
            } else {
                device.copy_entire_texture(&mut texture, &blit_source);
            }
            device.delete_texture(blit_source);
        }

        self.texture = Some(texture);
    }

    pub fn new(device: &mut Device, use_scatter: bool) -> Result<Self, super::RendererError> {
        use super::desc::GPU_CACHE_UPDATE;

        let bus = if use_scatter {
            assert!(
                device.get_capabilities().supports_color_buffer_float,
                "GpuCache scatter method requires EXT_color_buffer_float",
            );
            let program = device.create_program_linked(
                "gpu_cache_update",
                &[],
                &GPU_CACHE_UPDATE,
            )?;
            let buf_position = device.create_vbo();
            let buf_value = device.create_vbo();
            // Note: the vertex attributes have to be supplied in the same order
            // as for program creation, but each assigned to a different stream.
            let vao = device.create_custom_vao(&[
                buf_position.stream_with(&GPU_CACHE_UPDATE.vertex_attributes[0..1]),
                buf_value   .stream_with(&GPU_CACHE_UPDATE.vertex_attributes[1..2]),
            ]);
            GpuCacheBus::Scatter {
                program,
                vao,
                buf_position,
                buf_value,
                count: 0,
            }
        } else {
            GpuCacheBus::PixelBuffer {
                rows: Vec::new(),
            }
        };

        Ok(GpuCacheTexture {
            texture: None,
            bus,
        })
    }

    pub fn deinit(mut self, device: &mut Device) {
        if let Some(t) = self.texture.take() {
            device.delete_texture(t);
        }
        if let GpuCacheBus::Scatter { program, vao, buf_position, buf_value, .. } = self.bus {
            device.delete_program(program);
            device.delete_custom_vao(vao);
            device.delete_vbo(buf_position);
            device.delete_vbo(buf_value);
        }
    }

    pub fn get_height(&self) -> i32 {
        self.texture.as_ref().map_or(0, |t| t.get_dimensions().height)
    }

    #[cfg(feature = "capture")]
    pub fn get_texture(&self) -> &Texture {
        self.texture.as_ref().unwrap()
    }

    fn prepare_for_updates(
        &mut self,
        device: &mut Device,
        total_block_count: usize,
        max_height: i32,
    ) {
        self.ensure_texture(device, max_height);
        match self.bus {
            GpuCacheBus::PixelBuffer { .. } => {},
            GpuCacheBus::Scatter {
                ref mut buf_position,
                ref mut buf_value,
                ref mut count,
                ..
            } => {
                *count = 0;
                if total_block_count > buf_value.allocated_count() {
                    device.allocate_vbo(buf_position, total_block_count, super::ONE_TIME_USAGE_HINT);
                    device.allocate_vbo(buf_value, total_block_count, super::ONE_TIME_USAGE_HINT);
                }
            }
        }
    }

    pub fn invalidate(&mut self) {
        match self.bus {
            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
                info!("Invalidating GPU caches");
                for row in rows {
                    row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH);
                }
            }
            GpuCacheBus::Scatter { .. } => {
                warn!("Unable to invalidate scattered GPU cache");
            }
        }
    }

    fn update(&mut self, device: &mut Device, updates: &GpuCacheUpdateList) {
        match self.bus {
            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
                for update in &updates.updates {
                    match *update {
                        GpuCacheUpdate::Copy {
                            block_index,
                            block_count,
                            address,
                        } => {
                            let row = address.v as usize;

                            // Ensure that the CPU-side shadow copy of the GPU cache data has enough
                            // rows to apply this patch.
                            while rows.len() <= row {
                                // Add a new row.
                                rows.push(CacheRow::new());
                            }

                            // Copy the blocks from the patch array into the shadow CPU copy.
                            let block_offset = address.u as usize;
                            let data = &mut rows[row].cpu_blocks;
                            for i in 0 .. block_count {
                                data[block_offset + i] = updates.blocks[block_index + i];
                            }

                            // This row is dirty (needs to be updated in GPU texture).
                            rows[row].add_dirty(block_offset, block_count);
                        }
                    }
                }
            }
            GpuCacheBus::Scatter {
                ref buf_position,
                ref buf_value,
                ref mut count,
                ..
            } => {
                // TODO: re-use this heap allocation.
                // Unused positions will be left as 0xFFFF, which translates to
                // (1.0, 1.0) in the vertex output position and gets culled out.
                let mut position_data = vec![[!0u16; 2]; updates.blocks.len()];
                let size = self.texture.as_ref().unwrap().get_dimensions().to_usize();

                for update in &updates.updates {
                    match *update {
                        GpuCacheUpdate::Copy {
                            block_index,
                            block_count,
                            address,
                        } => {
                            // Convert the absolute texel position into normalized coordinates.
                            let y = ((2 * address.v as usize + 1) << 15) / size.height;
                            for i in 0 .. block_count {
                                let x = ((2 * address.u as usize + 2 * i + 1) << 15) / size.width;
                                position_data[block_index + i] = [x as _, y as _];
                            }
                        }
                    }
                }

                device.fill_vbo(buf_value, &updates.blocks, *count);
                device.fill_vbo(buf_position, &position_data, *count);
                *count += position_data.len();
            }
        }
    }

    fn flush(&mut self, device: &mut Device, pbo_pool: &mut UploadPBOPool) -> usize {
        let texture = self.texture.as_ref().unwrap();
        match self.bus {
            GpuCacheBus::PixelBuffer { ref mut rows } => {
                let rows_dirty = rows
                    .iter()
                    .filter(|row| row.is_dirty())
                    .count();
                if rows_dirty == 0 {
                    return 0;
                }

                let mut uploader = device.upload_texture(pbo_pool);

                for (row_index, row) in rows.iter_mut().enumerate() {
                    if !row.is_dirty() {
                        continue;
                    }

                    let blocks = row.dirty_blocks();
                    let rect = DeviceIntRect::from_origin_and_size(
                        DeviceIntPoint::new(row.min_dirty as i32, row_index as i32),
                        DeviceIntSize::new(blocks.len() as i32, 1),
                    );

                    uploader.upload(device, texture, rect, None, None, blocks.as_ptr(), blocks.len());

                    row.clear_dirty();
                }

                uploader.flush(device);

                rows_dirty
            }
            GpuCacheBus::Scatter { ref program, ref vao, count, .. } => {
                device.disable_depth();
                device.set_blend(false);
                device.bind_program(program);
                device.bind_custom_vao(vao);
                device.bind_draw_target(
                    DrawTarget::from_texture(
                        texture,
                        false,
                    ),
                );
                device.draw_nonindexed_points(0, count as _);
                0
            }
        }
    }

    #[cfg(feature = "replay")]
    pub fn remove_texture(&mut self, device: &mut Device) {
        if let Some(t) = self.texture.take() {
            device.delete_texture(t);
        }
    }

    #[cfg(feature = "replay")]
    pub fn load_from_data(&mut self, texture: Texture, data: Vec<u8>) {
        assert!(self.texture.is_none());
        match self.bus {
            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
                let dim = texture.get_dimensions();
                let blocks = unsafe {
                    std::slice::from_raw_parts(
                        data.as_ptr() as *const GpuBlockData,
                        data.len() / mem::size_of::<GpuBlockData>(),
                    )
                };
                // Fill up the CPU cache from the contents we just loaded.
                rows.clear();
                rows.extend((0 .. dim.height).map(|_| CacheRow::new()));
                let chunks = blocks.chunks(super::MAX_VERTEX_TEXTURE_WIDTH);
                debug_assert_eq!(chunks.len(), rows.len());
                for (row, chunk) in rows.iter_mut().zip(chunks) {
                    row.cpu_blocks.copy_from_slice(chunk);
                }
            }
            GpuCacheBus::Scatter { .. } => {}
        }
        self.texture = Some(texture);
    }

    pub fn report_memory_to(&self, report: &mut MemoryReport, size_op_funs: &MallocSizeOfOps) {
        if let GpuCacheBus::PixelBuffer { ref rows, .. } = self.bus {
            for row in rows.iter() {
                report.gpu_cache_cpu_mirror += unsafe { (size_op_funs.size_of_op)(row.cpu_blocks.as_ptr() as *const _) };
            }
        }

        // GPU cache GPU memory.
        report.gpu_cache_textures +=
            self.texture.as_ref().map_or(0, |t| t.size_in_bytes());
    }
}

impl super::Renderer {
    pub fn update_gpu_cache(&mut self) {
        let _gm = self.gpu_profiler.start_marker("gpu cache update");

        // For an artificial stress test of GPU cache resizing,
        // always pass an extra update list with at least one block in it.
        let gpu_cache_height = self.gpu_cache_texture.get_height();
        if gpu_cache_height != 0 && GPU_CACHE_RESIZE_TEST {
            self.pending_gpu_cache_updates.push(GpuCacheUpdateList {
                frame_id: FrameId::INVALID,
                clear: false,
                height: gpu_cache_height,
                blocks: vec![[1f32; 4].into()],
                updates: Vec::new(),
                debug_commands: Vec::new(),
            });
        }

        let (updated_blocks, max_requested_height) = self
            .pending_gpu_cache_updates
            .iter()
            .fold((0, gpu_cache_height), |(count, height), list| {
                (count + list.blocks.len(), cmp::max(height, list.height))
            });

        if max_requested_height > self.get_max_texture_size() && !self.gpu_cache_overflow {
            self.gpu_cache_overflow = true;
            self.renderer_errors.push(super::RendererError::MaxTextureSize);
        }

        // Note: if we decide to switch to scatter-style GPU cache updates
        // permanently, we can make this code nicer with a `BufferUploader`
        // kind of helper, similar to how the `TextureUploader` API is used.
        self.gpu_cache_texture.prepare_for_updates(
            &mut self.device,
            updated_blocks,
            max_requested_height,
        );

        for update_list in self.pending_gpu_cache_updates.drain(..) {
            assert!(update_list.height <= max_requested_height);
            if update_list.frame_id > self.gpu_cache_frame_id {
                self.gpu_cache_frame_id = update_list.frame_id
            }
            self.gpu_cache_texture
                .update(&mut self.device, &update_list);
        }

        self.profile.start_time(profiler::GPU_CACHE_UPLOAD_TIME);
        let updated_rows = self.gpu_cache_texture.flush(
            &mut self.device,
            &mut self.texture_upload_pbo_pool
        );
        self.gpu_cache_upload_time += self.profile.end_time(profiler::GPU_CACHE_UPLOAD_TIME);

        self.profile.set(profiler::GPU_CACHE_ROWS_UPDATED, updated_rows);
        self.profile.set(profiler::GPU_CACHE_BLOCKS_UPDATED, updated_blocks);
    }

    pub fn prepare_gpu_cache(
        &mut self,
        deferred_resolves: &[DeferredResolve],
    ) -> Result<(), super::RendererError> {
        if self.pending_gpu_cache_clear {
            let use_scatter =
                matches!(self.gpu_cache_texture.bus, GpuCacheBus::Scatter { .. });
            let new_cache = GpuCacheTexture::new(&mut self.device, use_scatter)?;
            let old_cache = mem::replace(&mut self.gpu_cache_texture, new_cache);
            old_cache.deinit(&mut self.device);
            self.pending_gpu_cache_clear = false;
        }

        let deferred_update_list = self.update_deferred_resolves(deferred_resolves);
        self.pending_gpu_cache_updates.extend(deferred_update_list);

        self.update_gpu_cache();

        // Note: the texture might have changed during the `update`,
        // so we need to bind it here.
        self.device.bind_texture(
            super::TextureSampler::GpuCache,
            self.gpu_cache_texture.texture.as_ref().unwrap(),
            Swizzle::default(),
        );

        Ok(())
    }

    pub fn read_gpu_cache(&mut self) -> (DeviceIntSize, Vec<u8>) {
        let texture = self.gpu_cache_texture.texture.as_ref().unwrap();
        let size = device_size_as_framebuffer_size(texture.get_dimensions());
        let mut texels = vec![0; (size.width * size.height * 16) as usize];
        self.device.begin_frame();
        self.device.bind_read_target(ReadTarget::from_texture(texture));
        self.device.read_pixels_into(
            size.into(),
            api::ImageFormat::RGBAF32,
            &mut texels,
        );
        self.device.reset_read_target();
        self.device.end_frame();
        (texture.get_dimensions(), texels)
    }
}
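
// Illustrative sketch (not from the upstream source): a minimal test module
// for the two pieces of pure logic above. The first test exercises
// `CacheRow`'s single-interval dirty tracking, assuming the row width
// `super::MAX_VERTEX_TEXTURE_WIDTH` is at least 11 blocks (it is much larger
// in practice); the second checks the fixed-point midpoint math the scatter
// path in `update` uses to normalize texel positions into `u16` range.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cache_row_merges_dirty_spans_into_one_interval() {
        let mut row = CacheRow::new();
        assert!(!row.is_dirty());

        row.add_dirty(4, 2);   // marks blocks [4, 6) dirty
        row.add_dirty(10, 1);  // marks blocks [10, 11) dirty

        // The row tracks a single conservative [min, max) interval, so the
        // upload covers [4, 11): 7 blocks, although only 3 actually changed.
        assert!(row.is_dirty());
        assert_eq!(row.dirty_blocks().len(), 7);

        row.clear_dirty();
        assert!(!row.is_dirty());
    }

    #[test]
    fn scatter_normalization_targets_texel_centers() {
        let height = 8usize;
        for v in 0 .. height {
            // Same expression as in `update`: map the center of texel `v`
            // into the normalized [0, 0x10000) fixed-point range.
            let y = ((2 * v + 1) << 15) / height;
            assert!(y <= u16::MAX as usize);
            // Scaling back by the texture extent recovers the texel index.
            assert_eq!((y * height) >> 16, v);
        }
    }
}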