Bug 1498732 - Ensure PBO texture upload is performed asynchronously on webrender on Adrenos. r=kvark
author Jamie Nicol <jnicol@mozilla.com>
Mon, 25 Feb 2019 15:00:09 +0000
changeset 460897 a9eee2d6d9b84752ec885fe2f55a4c3c4337b83f
parent 460896 dd8f4d598a431d201ac936435c1b8b96dfcd2dde
child 460898 a69e03c02d34a08c915087ea403e68bbcebe7bdb
push id 35613
push user nerli@mozilla.com
push date Tue, 26 Feb 2019 03:52:35 +0000
treeherder mozilla-central@faec87a80ed1
reviewers kvark
bugs 1498732
milestone 67.0a1
Bug 1498732 - Ensure PBO texture upload is performed asynchronously on webrender on Adrenos. r=kvark

Currently on Android we upload texture data to the webrender texture cache
using a PBO. On Adreno GPUs, however, this upload is still being done
synchronously, and profiles show a lot of time spent waiting in
glTexSubImage3D. The problem is that the stride of the data in the PBO is
not a multiple of 256 bytes, so the driver is not able to DMA the upload.

This patch ensures that data is laid out optimally in the PBO, using
glMapBufferRange then copying the data line-by-line if required. This allows
the driver to perform the upload asynchronously as intended.

Differential Revision: https://phabricator.services.mozilla.com/D20492
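As context for the patch below, the following is a minimal, self-contained sketch of the stride-padding idea in isolation: round the source row stride up to the GPU's preferred multiple (256 bytes on Adreno, 4 bytes otherwise) and copy each row into a destination laid out with that padded stride. The names OPTIMAL_PBO_STRIDE and copy_rows_padded and the example dimensions are hypothetical; the actual change in gl.rs writes the padded rows through glMapBufferRange into the bound PIXEL_UNPACK_BUFFER rather than into a Vec.

    // Illustrative constant: Adreno wants a 256-byte row stride; other GPUs use 4.
    const OPTIMAL_PBO_STRIDE: usize = 256;

    // Round val up to the next multiple of mul (mul must be non-zero).
    fn round_up_to_multiple(val: usize, mul: usize) -> usize {
        (val + mul - 1) / mul * mul
    }

    // Copy `height` rows of `width_bytes` each from a source laid out with
    // `src_stride` into a destination laid out with the padded `dst_stride`.
    fn copy_rows_padded(
        src: &[u8],
        dst: &mut [u8],
        height: usize,
        width_bytes: usize,
        src_stride: usize,
        dst_stride: usize,
    ) {
        for y in 0..height {
            let s = y * src_stride;
            let d = y * dst_stride;
            dst[d..d + width_bytes].copy_from_slice(&src[s..s + width_bytes]);
        }
    }

    fn main() {
        // A 100-pixel-wide RGBA8 row is 400 bytes; padded to 512 for Adreno.
        let width_bytes = 100 * 4;
        let src_stride = width_bytes;
        let dst_stride = round_up_to_multiple(src_stride, OPTIMAL_PBO_STRIDE);
        assert_eq!(dst_stride, 512);

        let height = 3;
        let src = vec![0xABu8; height * src_stride];
        // Last row does not need trailing padding, matching the patch's dst_size.
        let mut dst = vec![0u8; (height - 1) * dst_stride + width_bytes];
        copy_rows_padded(&src, &mut dst, height, width_bytes, src_stride, dst_stride);
        println!("padded stride = {} bytes", dst_stride);
    }

With the rows padded this way, each row in the buffer starts on a 256-byte boundary, which is what lets the Adreno driver DMA the upload instead of stalling in glTexSubImage3D.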
gfx/wr/webrender/src/device/gl.rs
--- a/gfx/wr/webrender/src/device/gl.rs
+++ b/gfx/wr/webrender/src/device/gl.rs
@@ -15,16 +15,17 @@ use log::Level;
 use sha2::{Digest, Sha256};
 use smallvec::SmallVec;
 use std::borrow::Cow;
 use std::cell::{Cell, RefCell};
 use std::cmp;
 use std::collections::hash_map::Entry;
 use std::marker::PhantomData;
 use std::mem;
+use std::num::NonZeroUsize;
 use std::os::raw::c_void;
 use std::ops::Add;
 use std::path::PathBuf;
 use std::ptr;
 use std::rc::Rc;
 use std::slice;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -958,16 +959,18 @@ pub struct Device {
     /// some drivers, particularly ANGLE). However, it is not always supported
     /// at all, or for BGRA8 format. If it's not supported for the required
     /// format, we fall back to glTexImage*.
     texture_storage_usage: TexStorageUsage,
 
     /// Whether the function glCopyImageSubData is available.
     supports_copy_image_sub_data: bool,
 
+    optimal_pbo_stride: NonZeroUsize,
+
     // GL extensions
     extensions: Vec<String>,
 }
 
 /// Contains the parameters necessary to bind a draw target.
 #[derive(Clone, Copy)]
 pub enum DrawTarget<'a> {
     /// Use the device's default draw target, with the provided dimensions,
@@ -1167,16 +1170,27 @@ impl Device {
                     TexStorageUsage::Never
                 },
             )
         };
 
         let supports_copy_image_sub_data = supports_extension(&extensions, "GL_EXT_copy_image") ||
             supports_extension(&extensions, "GL_ARB_copy_image");
 
+        // On Adreno GPUs PBO texture upload is only performed asynchronously
+        // if the stride of the data in the PBO is a multiple of 256 bytes.
+        // Other platforms may have similar requirements and should be added
+        // here.
+        // The default of 4 matches the GL default unpack alignment.
+        let optimal_pbo_stride = if renderer_name.contains("Adreno") {
+            NonZeroUsize::new(256).unwrap()
+        } else {
+            NonZeroUsize::new(4).unwrap()
+        };
+
         Device {
             gl,
             resource_override_path,
             upload_method,
             inside_frame: false,
 
             capabilities: Capabilities {
                 supports_multisampling: false, //TODO
@@ -1200,17 +1214,18 @@ impl Device {
 
             max_texture_size,
             max_texture_layers,
             renderer_name,
             cached_programs,
             frame_id: GpuFrameId(0),
             extensions,
             texture_storage_usage,
-            supports_copy_image_sub_data
+            supports_copy_image_sub_data,
+            optimal_pbo_stride,
         }
     }
 
     pub fn gl(&self) -> &gl::Gl {
         &*self.gl
     }
 
     pub fn rc_gl(&self) -> &Rc<gl::Gl> {
@@ -2153,16 +2168,17 @@ impl Device {
                 Some(PixelBuffer::new(hint.to_gl(), upload_size))
             },
         };
 
         TextureUploader {
             target: UploadTarget {
                 gl: &*self.gl,
                 bgra_format: self.bgra_format_external,
+                optimal_pbo_stride: self.optimal_pbo_stride,
                 texture,
             },
             buffer,
             marker: PhantomData,
         }
     }
 
     /// Performs an immediate (non-PBO) texture upload.
@@ -2874,16 +2890,17 @@ impl PixelBuffer {
             chunks: SmallVec::new(),
         }
     }
 }
 
 struct UploadTarget<'a> {
     gl: &'a gl::Gl,
     bgra_format: gl::GLuint,
+    optimal_pbo_stride: NonZeroUsize,
     texture: &'a Texture,
 }
 
 pub struct TextureUploader<'a, T> {
     target: UploadTarget<'a>,
     buffer: Option<PixelBuffer>,
     marker: PhantomData<T>,
 }
@@ -2894,16 +2911,23 @@ impl<'a, T> Drop for TextureUploader<'a,
             for chunk in buffer.chunks {
                 self.target.update_impl(chunk);
             }
             self.target.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
         }
     }
 }
 
+fn round_up_to_multiple(val: usize, mul: NonZeroUsize) -> usize {
+    match val % mul.get() {
+        rem if rem > 0 => val - rem + mul.get(),
+        _ => val,
+    }
+}
+
 impl<'a, T> TextureUploader<'a, T> {
     pub fn upload(
         &mut self,
         mut rect: DeviceIntRect,
         layer_index: i32,
         stride: Option<i32>,
         data: &[T],
     ) -> usize {
@@ -2915,69 +2939,106 @@ impl<'a, T> TextureUploader<'a, T> {
         if cfg!(debug_assertions) && cropped.map_or(true, |r| r != rect) {
             warn!("Cropping texture upload {:?} to {:?}", rect, cropped);
         }
         rect = match cropped {
             None => return 0,
             Some(r) => r,
         };
 
-        let bytes_pp = self.target.texture.format.bytes_per_pixel();
-        let upload_size = match stride {
-            Some(stride) => ((rect.size.height - 1) * stride + rect.size.width * bytes_pp) as usize,
-            None => (rect.size.area() * bytes_pp) as usize,
-        };
-        assert!(upload_size <= data.len() * mem::size_of::<T>());
+        let bytes_pp = self.target.texture.format.bytes_per_pixel() as usize;
+        let width_bytes = rect.size.width as usize * bytes_pp;
+
+        let src_stride = stride.map_or(width_bytes, |stride| {
+            assert!(stride >= 0);
+            stride as usize
+        });
+        let src_size = (rect.size.height as usize - 1) * src_stride + width_bytes;
+        assert!(src_size <= data.len() * mem::size_of::<T>());
+
+        // For optimal PBO texture uploads the stride of the data in
+        // the buffer may have to be a multiple of a certain value.
+        let dst_stride = round_up_to_multiple(src_stride, self.target.optimal_pbo_stride);
+        let dst_size = (rect.size.height as usize - 1) * dst_stride + width_bytes;
 
         match self.buffer {
             Some(ref mut buffer) => {
-                let elem_count = upload_size / mem::size_of::<T>();
-                assert_eq!(elem_count * mem::size_of::<T>(), upload_size);
-                let slice = &data[.. elem_count];
-
-                if buffer.size_used + upload_size > buffer.size_allocated {
+                if buffer.size_used + dst_size > buffer.size_allocated {
                     // flush
                     for chunk in buffer.chunks.drain() {
                         self.target.update_impl(chunk);
                     }
                     buffer.size_used = 0;
                 }
 
-                if upload_size > buffer.size_allocated {
-                    gl::buffer_data(
-                        self.target.gl,
+                if dst_size > buffer.size_allocated {
+                    // allocate a buffer large enough
+                    self.target.gl.buffer_data_untyped(
                         gl::PIXEL_UNPACK_BUFFER,
-                        slice,
+                        dst_size as _,
+                        ptr::null(),
                         buffer.usage,
                     );
-                    buffer.size_allocated = upload_size;
-                } else {
+                    buffer.size_allocated = dst_size;
+                }
+
+                if src_stride == dst_stride {
+                    // The stride is already optimal, so simply copy
+                    // the data as-is into the buffer.
+                    let elem_count = src_size / mem::size_of::<T>();
+                    assert_eq!(elem_count * mem::size_of::<T>(), src_size);
+                    let slice = &data[.. elem_count];
+
                     gl::buffer_sub_data(
                         self.target.gl,
                         gl::PIXEL_UNPACK_BUFFER,
                         buffer.size_used as _,
                         slice,
                     );
+                } else {
+                    // Copy the data line-by-line into the buffer so
+                    // that it has an optimal stride.
+                    let ptr = self.target.gl.map_buffer_range(
+                        gl::PIXEL_UNPACK_BUFFER,
+                        buffer.size_used as _,
+                        dst_size as _,
+                        gl::MAP_WRITE_BIT | gl::MAP_INVALIDATE_RANGE_BIT);
+
+                    unsafe {
+                        let src: &[u8] = slice::from_raw_parts(data.as_ptr() as *const u8, src_size);
+                        let dst: &mut [u8] = slice::from_raw_parts_mut(ptr as *mut u8, dst_size);
+
+                        for y in 0..rect.size.height as usize {
+                            let src_start = y * src_stride;
+                            let src_end = src_start + width_bytes;
+                            let dst_start = y * dst_stride;
+                            let dst_end = dst_start + width_bytes;
+
+                            dst[dst_start..dst_end].copy_from_slice(&src[src_start..src_end])
+                        }
+                    }
+
+                    self.target.gl.unmap_buffer(gl::PIXEL_UNPACK_BUFFER);
                 }
 
                 buffer.chunks.push(UploadChunk {
-                    rect, layer_index, stride,
+                    rect, layer_index, stride: Some(dst_stride as i32),
                     offset: buffer.size_used,
                 });
-                buffer.size_used += upload_size;
+                buffer.size_used += dst_size;
             }
             None => {
                 self.target.update_impl(UploadChunk {
                     rect, layer_index, stride,
                     offset: data.as_ptr() as _,
                 });
             }
         }
 
-        upload_size
+        dst_size
     }
 }
 
 impl<'a> UploadTarget<'a> {
     fn update_impl(&mut self, chunk: UploadChunk) {
         let (gl_format, bpp, data_type) = match self.texture.format {
             ImageFormat::R8 => (gl::RED, 1, gl::UNSIGNED_BYTE),
             ImageFormat::R16 => (gl::RED, 2, gl::UNSIGNED_SHORT),