Bug 1598380 - Use single PBO per texture for uploads. r=kvark
authorJamie Nicol <jnicol@mozilla.com>
Tue, 17 Dec 2019 19:07:12 +0000
changeset 507457 44161695885886b689baae2ad8b357f81f3eef23
parent 507456 246a4db8140218f0ab29ff29cc38f624822126ec
child 507458 c5a0a59e597d46b0d2c27d39e807b5ff298d6645
push id36927
push useraiakab@mozilla.com
push dateWed, 18 Dec 2019 00:51:03 +0000
treeherdermozilla-central@35af0b925215 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerskvark
bugs1598380, 1603783
milestone73.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1598380 - Use single PBO per texture for uploads. r=kvark Uploading texture data is showing up frequently in profiles on Adreno devices, especially when zooming on a text-heavy page. Specifically, the time is spent in glMapBufferRange and glBufferSubData, most likely when internally allocating the buffer before transferring data into it. Currently, we are creating a new PBO, by calling glBufferData(), for each individual upload region. This change makes it so that we calculate the required size for all upload regions to a texture, then create a single PBO of the required size. The entire buffer is then mapped only once, and each individual upload chunk is written to it. This can require the driver to allocate a large buffer, sometimes multiple megabytes in size. However, it handles this case much better than allocating tens or even hundreds of smaller buffers. An upload chunk may require more space in a PBO than the original CPU-side buffer, so that the data is aligned correctly for performance or correctness reasons. Therefore it is the caller of Device.upload_texture()'s responsibility to call a new function, Device.required_upload_size_and_stride(), to calculate the required size beforehand. On AMD Macs, there is a bug where PBO uploads from a non-zero offset can fail. See bug 1603783. Therefore this patch preserves the current behaviour on AMD Mac, reallocating a new PBO for each upload, therefore ensuring the offset is always zero. Differential Revision: https://phabricator.services.mozilla.com/D56382
gfx/wr/webrender/src/device/gl.rs
gfx/wr/webrender/src/internal_types.rs
gfx/wr/webrender/src/renderer.rs
gfx/wr/webrender/src/texture_cache.rs
--- a/gfx/wr/webrender/src/device/gl.rs
+++ b/gfx/wr/webrender/src/device/gl.rs
@@ -941,16 +941,19 @@ pub struct Capabilities {
     pub supports_pixel_local_storage: bool,
     /// Whether advanced blend equations are supported.
     pub supports_advanced_blend_equation: bool,
     /// Whether KHR_debug is supported for getting debug messages from
     /// the driver.
     pub supports_khr_debug: bool,
     /// Whether we can configure texture units to do swizzling on sampling.
     pub supports_texture_swizzle: bool,
+    /// Whether the driver supports uploading to textures from a non-zero
+    /// offset within a PBO.
+    pub supports_nonzero_pbo_offsets: bool,
 }
 
 #[derive(Clone, Debug)]
 pub enum ShaderError {
     Compilation(String, String), // name, error message
     Link(String, String),        // name, error message
 }
 
@@ -1435,31 +1438,36 @@ impl Device {
         // The default value should be 4.
         let is_amd_macos = cfg!(target_os = "macos") && renderer_name.starts_with("AMD");
         let optimal_pbo_stride = if is_adreno || is_amd_macos {
             NonZeroUsize::new(256).unwrap()
         } else {
             NonZeroUsize::new(4).unwrap()
         };
 
+        // On AMD Macs there is a driver bug which causes some texture uploads
+        // from a non-zero offset within a PBO to fail. See bug 1603783.
+        let supports_nonzero_pbo_offsets = !is_amd_macos;
+
         Device {
             gl,
             base_gl: None,
             resource_override_path,
             upload_method,
             inside_frame: false,
 
             capabilities: Capabilities {
                 supports_multisampling: false, //TODO
                 supports_copy_image_sub_data,
                 supports_blit_to_texture_array,
                 supports_pixel_local_storage,
                 supports_advanced_blend_equation,
                 supports_khr_debug,
                 supports_texture_swizzle,
+                supports_nonzero_pbo_offsets,
             },
 
             color_formats,
             bgra_formats,
             swizzle_settings: SwizzleSettings {
                 bgra8_sampling_swizzle,
             },
 
@@ -2692,50 +2700,108 @@ impl Device {
     }
 
     pub fn delete_pbo(&mut self, mut pbo: PBO) {
         self.gl.delete_buffers(&[pbo.id]);
         pbo.id = 0;
         pbo.reserved_size = 0
     }
 
+    /// Returns the size and stride in bytes required to upload an area of pixels
+    /// of the specified size, to a texture of the specified format.
+    pub fn required_upload_size_and_stride(&self, size: DeviceIntSize, format: ImageFormat) -> (usize, usize) {
+        assert!(size.width >= 0);
+        assert!(size.height >= 0);
+
+        let bytes_pp = format.bytes_per_pixel() as usize;
+        let width_bytes = size.width as usize * bytes_pp;
+
+        let dst_stride = round_up_to_multiple(width_bytes, self.optimal_pbo_stride);
+
+        // The size of the chunk should only need to be (height - 1) * dst_stride + width_bytes,
+        // however, the android emulator will error unless it is height * dst_stride.
+        // See bug 1587047 for details.
+        // Using the full final row also ensures that the offset of the next chunk is
+        // optimally aligned.
+        let dst_size = dst_stride * size.height as usize;
+
+        (dst_size, dst_stride)
+    }
+
+    /// (Re)allocates and maps a PBO, returning a `PixelBuffer` if successful.
+    /// The contents can be written to using the `mapping` field.
+    /// The buffer must be bound to `GL_PIXEL_UNPACK_BUFFER` before calling this function,
+    /// and must be unmapped using `glUnmapBuffer` prior to uploading the contents to a texture.
+    fn create_upload_buffer<'a>(&mut self, hint: VertexUsageHint, size: usize) -> Result<PixelBuffer<'a>, ()> {
+        self.gl.buffer_data_untyped(
+            gl::PIXEL_UNPACK_BUFFER,
+            size as _,
+            ptr::null(),
+            hint.to_gl(),
+        );
+        let ptr = self.gl.map_buffer_range(
+            gl::PIXEL_UNPACK_BUFFER,
+            0,
+            size as _,
+            gl::MAP_WRITE_BIT | gl::MAP_INVALIDATE_BUFFER_BIT,
+        );
+
+        if ptr != ptr::null_mut() {
+            let mapping = unsafe {
+                slice::from_raw_parts_mut(ptr as *mut _, size)
+            };
+            Ok(PixelBuffer::new(size, mapping))
+        } else {
+            error!("Failed to map PBO of size {} bytes", size);
+            Err(())
+        }
+    }
+
+    /// Returns a `TextureUploader` which can be used to upload texture data to `texture`.
+    /// The total size in bytes is specified by `upload_size`, and must be greater than zero
+    /// and at least as large as the sum of the sizes returned from
+    /// `required_upload_size_and_stride()` for each subsequent call to `TextureUploader.upload()`.
     pub fn upload_texture<'a, T>(
         &'a mut self,
         texture: &'a Texture,
         pbo: &PBO,
-        upload_count: usize,
+        upload_size: usize,
     ) -> TextureUploader<'a, T> {
         debug_assert!(self.inside_frame);
+        assert_ne!(upload_size, 0, "Must specify valid upload size");
+
         self.bind_texture(DEFAULT_TEXTURE, texture, Swizzle::default());
 
-        let buffer = match self.upload_method {
-            UploadMethod::Immediate => None,
+        let uploader_type = match self.upload_method {
+            UploadMethod::Immediate => TextureUploaderType::Immediate,
             UploadMethod::PixelBuffer(hint) => {
-                let upload_size = upload_count * mem::size_of::<T>();
                 self.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, pbo.id);
-                if upload_size != 0 {
-                    self.gl.buffer_data_untyped(
-                        gl::PIXEL_UNPACK_BUFFER,
-                        upload_size as _,
-                        ptr::null(),
-                        hint.to_gl(),
-                    );
+                if self.capabilities.supports_nonzero_pbo_offsets {
+                    match self.create_upload_buffer(hint, upload_size) {
+                        Ok(buffer) => TextureUploaderType::MutliUseBuffer(buffer),
+                        Err(_) => {
+                            // If allocating the buffer failed, fall back to immediate uploads
+                            self.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
+                            TextureUploaderType::Immediate
+                        }
+                    }
+                } else {
+                    // If we cannot upload from non-zero offsets, then we must
+                    // reallocate a new buffer for each upload.
+                    TextureUploaderType::SingleUseBuffers(hint)
                 }
-                Some(PixelBuffer::new(hint.to_gl(), upload_size))
             },
         };
 
         TextureUploader {
             target: UploadTarget {
-                gl: &*self.gl,
-                bgra_format: self.bgra_formats.external,
-                optimal_pbo_stride: self.optimal_pbo_stride,
+                device: self,
                 texture,
             },
-            buffer,
+            uploader_type,
             marker: PhantomData,
         }
     }
 
     /// Performs an immediate (non-PBO) texture upload.
     pub fn upload_texture_immediate<T: Texel>(
         &mut self,
         texture: &Texture,
@@ -3480,58 +3546,80 @@ pub struct FormatDesc {
 struct UploadChunk {
     rect: DeviceIntRect,
     layer_index: i32,
     stride: Option<i32>,
     offset: usize,
     format_override: Option<ImageFormat>,
 }
 
-struct PixelBuffer {
-    usage: gl::GLenum,
+struct PixelBuffer<'a> {
     size_allocated: usize,
     size_used: usize,
     // small vector avoids heap allocation for a single chunk
     chunks: SmallVec<[UploadChunk; 1]>,
+    mapping: &'a mut [mem::MaybeUninit<u8>],
 }
 
-impl PixelBuffer {
+impl<'a> PixelBuffer<'a> {
     fn new(
-        usage: gl::GLenum,
         size_allocated: usize,
+        mapping: &'a mut [mem::MaybeUninit<u8>],
     ) -> Self {
         PixelBuffer {
-            usage,
             size_allocated,
             size_used: 0,
             chunks: SmallVec::new(),
+            mapping,
         }
     }
+
+    fn flush_chunks(&mut self, target: &mut UploadTarget) {
+        for chunk in self.chunks.drain() {
+            target.update_impl(chunk);
+        }
+        self.size_used = 0;
+    }
+}
+
+impl<'a> Drop for PixelBuffer<'a> {
+    fn drop(&mut self) {
+        assert_eq!(self.chunks.len(), 0, "PixelBuffer must be flushed before dropping.");
+    }
 }
 
 struct UploadTarget<'a> {
-    gl: &'a dyn gl::Gl,
-    bgra_format: gl::GLuint,
-    optimal_pbo_stride: NonZeroUsize,
+    device: &'a mut Device,
     texture: &'a Texture,
 }
 
+enum TextureUploaderType<'a> {
+    Immediate,
+    SingleUseBuffers(VertexUsageHint),
+    MutliUseBuffer(PixelBuffer<'a>)
+}
+
 pub struct TextureUploader<'a, T> {
     target: UploadTarget<'a>,
-    buffer: Option<PixelBuffer>,
+    uploader_type: TextureUploaderType<'a>,
     marker: PhantomData<T>,
 }
 
 impl<'a, T> Drop for TextureUploader<'a, T> {
     fn drop(&mut self) {
-        if let Some(buffer) = self.buffer.take() {
-            for chunk in buffer.chunks {
-                self.target.update_impl(chunk);
+        match self.uploader_type {
+            TextureUploaderType::MutliUseBuffer(ref mut buffer) => {
+                self.target.device.gl.unmap_buffer(gl::PIXEL_UNPACK_BUFFER);
+                buffer.flush_chunks(&mut self.target);
+                self.target.device.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
             }
-            self.target.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
+            TextureUploaderType::SingleUseBuffers(_) => {
+                self.target.device.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
+            }
+            TextureUploaderType::Immediate => {}
         }
     }
 }
 
 impl<'a, T> TextureUploader<'a, T> {
     pub fn upload(
         &mut self,
         mut rect: DeviceIntRect,
@@ -3559,175 +3647,182 @@ impl<'a, T> TextureUploader<'a, T> {
 
         let src_stride = stride.map_or(width_bytes, |stride| {
             assert!(stride >= 0);
             stride as usize
         });
         let src_size = (rect.size.height as usize - 1) * src_stride + width_bytes;
         assert!(src_size <= len * mem::size_of::<T>());
 
-        // for optimal PBO texture uploads the stride of the data in
+        // for optimal PBO texture uploads the offset and stride of the data in
         // the buffer may have to be a multiple of a certain value.
-        let dst_stride = round_up_to_multiple(src_stride, self.target.optimal_pbo_stride);
-        // The size of the PBO should only need to be (height - 1) * dst_stride + width_bytes,
-        // however, the android emulator will error unless it is height * dst_stride.
-        // See bug 1587047 for details.
-        let dst_size = rect.size.height as usize * dst_stride;
-
-        match self.buffer {
-            Some(ref mut buffer) => {
-                if buffer.size_used + dst_size > buffer.size_allocated {
-                    // flush
-                    for chunk in buffer.chunks.drain() {
-                        self.target.update_impl(chunk);
+        let (dst_size, dst_stride) = self.target.device.required_upload_size_and_stride(
+            rect.size,
+            self.target.texture.format,
+        );
+
+        // Choose the buffer to use, if any, allocating a new single-use buffer if required.
+        let mut single_use_buffer = None;
+        let mut buffer = match self.uploader_type {
+            TextureUploaderType::MutliUseBuffer(ref mut buffer) => Some(buffer),
+            TextureUploaderType::SingleUseBuffers(hint) => {
+                match self.target.device.create_upload_buffer(hint, dst_size) {
+                    Ok(buffer) => {
+                        single_use_buffer = Some(buffer);
+                        single_use_buffer.as_mut()
                     }
-                    buffer.size_used = 0;
+                    Err(_) => {
+                        // If allocating the buffer failed, fall back to immediate uploads
+                        self.target.device.gl.bind_buffer(gl::PIXEL_UNPACK_BUFFER, 0);
+                        self.uploader_type = TextureUploaderType::Immediate;
+                        None
+                    }
                 }
-
-                if dst_size > buffer.size_allocated {
-                    // allocate a buffer large enough
-                    self.target.gl.buffer_data_untyped(
-                        gl::PIXEL_UNPACK_BUFFER,
-                        dst_size as _,
-                        ptr::null(),
-                        buffer.usage,
-                    );
-                    buffer.size_allocated = dst_size;
+            }
+            TextureUploaderType::Immediate => None,
+        };
+
+        match buffer {
+            Some(ref mut buffer) => {
+                if !self.target.device.capabilities.supports_nonzero_pbo_offsets {
+                    assert_eq!(buffer.size_used, 0, "PBO uploads from non-zero offset are not supported.");
                 }
-
-                if src_stride == dst_stride {
-                    // the stride is already optimal, so simply copy
-                    // the data as-is in to the buffer
-                    assert_eq!(src_size % mem::size_of::<T>(), 0);
-                    self.target.gl.buffer_sub_data_untyped(
-                        gl::PIXEL_UNPACK_BUFFER,
-                        buffer.size_used as isize,
-                        src_size as isize,
-                        data as *const _,
-                    );
-                } else {
-                    // copy the data line-by-line in to the buffer so
-                    // that it has an optimal stride
-                    let ptr = self.target.gl.map_buffer_range(
-                        gl::PIXEL_UNPACK_BUFFER,
-                        buffer.size_used as _,
-                        dst_size as _,
-                        gl::MAP_WRITE_BIT | gl::MAP_INVALIDATE_RANGE_BIT,
-                    );
-
-                    unsafe {
-                        let src: &[mem::MaybeUninit<u8>] = slice::from_raw_parts(data as *const _, src_size);
-                        let dst: &mut [mem::MaybeUninit<u8>] = slice::from_raw_parts_mut(ptr as *mut _, dst_size);
-
+                assert!(buffer.size_used + dst_size <= buffer.size_allocated, "PixelBuffer is too small");
+
+                unsafe {
+                    let src: &[mem::MaybeUninit<u8>] = slice::from_raw_parts(data as *const _, src_size);
+
+                    if src_stride == dst_stride {
+                        // the stride is already optimal, so simply copy
+                        // the data as-is in to the buffer
+                        let dst_start = buffer.size_used;
+                        let dst_end = dst_start + src_size;
+
+                        buffer.mapping[dst_start..dst_end].copy_from_slice(src);
+                    } else {
+                        // copy the data line-by-line in to the buffer so
+                        // that it has an optimal stride
                         for y in 0..rect.size.height as usize {
                             let src_start = y * src_stride;
                             let src_end = src_start + width_bytes;
-                            let dst_start = y * dst_stride;
+                            let dst_start = buffer.size_used + y * dst_stride;
                             let dst_end = dst_start + width_bytes;
 
-                            dst[dst_start..dst_end].copy_from_slice(&src[src_start..src_end])
+                            buffer.mapping[dst_start..dst_end].copy_from_slice(&src[src_start..src_end])
                         }
                     }
-
-                    self.target.gl.unmap_buffer(gl::PIXEL_UNPACK_BUFFER);
                 }
 
                 buffer.chunks.push(UploadChunk {
                     rect,
                     layer_index,
                     stride: Some(dst_stride as i32),
                     offset: buffer.size_used,
                     format_override,
                 });
                 buffer.size_used += dst_size;
             }
             None => {
+                if cfg!(debug_assertions) {
+                    let mut bound_buffer = [0];
+                    unsafe {
+                        self.target.device.gl.get_integer_v(gl::PIXEL_UNPACK_BUFFER_BINDING, &mut bound_buffer);
+                    }
+                    assert_eq!(bound_buffer[0], 0, "GL_PIXEL_UNPACK_BUFFER must not be bound for immediate uploads.");
+                }
+
                 self.target.update_impl(UploadChunk {
                     rect,
                     layer_index,
                     stride,
                     offset: data as _,
                     format_override,
                 });
             }
         }
 
+        // Flush the buffer if it is for single-use.
+        if let Some(ref mut buffer) = single_use_buffer {
+            self.target.device.gl.unmap_buffer(gl::PIXEL_UNPACK_BUFFER);
+            buffer.flush_chunks(&mut self.target);
+        }
+
         dst_size
     }
 }
 
 impl<'a> UploadTarget<'a> {
     fn update_impl(&mut self, chunk: UploadChunk) {
         let format = chunk.format_override.unwrap_or(self.texture.format);
         let (gl_format, bpp, data_type) = match format {
             ImageFormat::R8 => (gl::RED, 1, gl::UNSIGNED_BYTE),
             ImageFormat::R16 => (gl::RED, 2, gl::UNSIGNED_SHORT),
-            ImageFormat::BGRA8 => (self.bgra_format, 4, gl::UNSIGNED_BYTE),
+            ImageFormat::BGRA8 => (self.device.bgra_formats.external, 4, gl::UNSIGNED_BYTE),
             ImageFormat::RGBA8 => (gl::RGBA, 4, gl::UNSIGNED_BYTE),
             ImageFormat::RG8 => (gl::RG, 2, gl::UNSIGNED_BYTE),
             ImageFormat::RG16 => (gl::RG, 4, gl::UNSIGNED_SHORT),
             ImageFormat::RGBAF32 => (gl::RGBA, 16, gl::FLOAT),
             ImageFormat::RGBAI32 => (gl::RGBA_INTEGER, 16, gl::INT),
         };
 
         let row_length = match chunk.stride {
             Some(value) => value / bpp,
             None => self.texture.size.width,
         };
 
         if chunk.stride.is_some() {
-            self.gl.pixel_store_i(
+            self.device.gl.pixel_store_i(
                 gl::UNPACK_ROW_LENGTH,
                 row_length as _,
             );
         }
 
         let pos = chunk.rect.origin;
         let size = chunk.rect.size;
 
         match self.texture.target {
             gl::TEXTURE_2D_ARRAY => {
-                self.gl.tex_sub_image_3d_pbo(
+                self.device.gl.tex_sub_image_3d_pbo(
                     self.texture.target,
                     0,
                     pos.x as _,
                     pos.y as _,
                     chunk.layer_index,
                     size.width as _,
                     size.height as _,
                     1,
                     gl_format,
                     data_type,
                     chunk.offset,
                 );
             }
             gl::TEXTURE_2D | gl::TEXTURE_RECTANGLE | gl::TEXTURE_EXTERNAL_OES => {
-                self.gl.tex_sub_image_2d_pbo(
+                self.device.gl.tex_sub_image_2d_pbo(
                     self.texture.target,
                     0,
                     pos.x as _,
                     pos.y as _,
                     size.width as _,
                     size.height as _,
                     gl_format,
                     data_type,
                     chunk.offset,
                 );
             }
             _ => panic!("BUG: Unexpected texture target!"),
         }
 
         // If using tri-linear filtering, build the mip-map chain for this texture.
         if self.texture.filter == TextureFilter::Trilinear {
-            self.gl.generate_mipmap(self.texture.target);
+            self.device.gl.generate_mipmap(self.texture.target);
         }
 
         // Reset row length to 0, otherwise the stride would apply to all texture uploads.
         if chunk.stride.is_some() {
-            self.gl.pixel_store_i(gl::UNPACK_ROW_LENGTH, 0 as _);
+            self.device.gl.pixel_store_i(gl::UNPACK_ROW_LENGTH, 0 as _);
         }
     }
 }
 
 fn texels_to_u8_slice<T: Texel>(texels: &[T]) -> &[u8] {
     unsafe {
         slice::from_raw_parts(texels.as_ptr() as *const u8, texels.len() * mem::size_of::<T>())
     }
--- a/gfx/wr/webrender/src/internal_types.rs
+++ b/gfx/wr/webrender/src/internal_types.rs
@@ -336,17 +336,16 @@ pub enum TextureCacheAllocationKind {
     Reset(TextureCacheAllocInfo),
     /// Frees the texture and the corresponding cache ID.
     Free,
 }
 
 /// Command to update the contents of the texture cache.
 #[derive(Debug)]
 pub struct TextureCacheUpdate {
-    pub id: CacheTextureId,
     pub rect: DeviceIntRect,
     pub stride: Option<i32>,
     pub offset: i32,
     pub layer_index: i32,
     pub format_override: Option<ImageFormat>,
     pub source: TextureUpdateSource,
 }
 
@@ -358,61 +357,63 @@ pub struct TextureCacheUpdate {
 #[derive(Default)]
 pub struct TextureUpdateList {
     /// Indicates that there was some kind of cleanup clear operation. Used for
     /// sanity checks.
     pub clears_shared_cache: bool,
     /// Commands to alloc/realloc/free the textures. Processed first.
     pub allocations: Vec<TextureCacheAllocation>,
     /// Commands to update the contents of the textures. Processed second.
-    pub updates: Vec<TextureCacheUpdate>,
+    pub updates: FastHashMap<CacheTextureId, Vec<TextureCacheUpdate>>,
 }
 
 impl TextureUpdateList {
     /// Mints a new `TextureUpdateList`.
     pub fn new() -> Self {
         TextureUpdateList {
             clears_shared_cache: false,
             allocations: Vec::new(),
-            updates: Vec::new(),
+            updates: FastHashMap::default(),
         }
     }
 
     /// Returns true if this is a no-op (no updates to be applied).
     pub fn is_nop(&self) -> bool {
         self.allocations.is_empty() && self.updates.is_empty()
     }
 
     /// Sets the clears_shared_cache flag for renderer-side sanity checks.
     #[inline]
     pub fn note_clear(&mut self) {
         self.clears_shared_cache = true;
     }
 
     /// Pushes an update operation onto the list.
     #[inline]
-    pub fn push_update(&mut self, update: TextureCacheUpdate) {
-        self.updates.push(update);
+    pub fn push_update(&mut self, id: CacheTextureId, update: TextureCacheUpdate) {
+        self.updates
+            .entry(id)
+            .or_default()
+            .push(update);
     }
 
     /// Sends a command to the Renderer to clear the portion of the shared region
     /// we just freed. Used when the texture cache debugger is enabled.
     #[cold]
     pub fn push_debug_clear(
         &mut self,
         id: CacheTextureId,
         origin: DeviceIntPoint,
         width: i32,
         height: i32,
         layer_index: usize
     ) {
         let size = DeviceIntSize::new(width, height);
         let rect = DeviceIntRect::new(origin, size);
-        self.push_update(TextureCacheUpdate {
-            id,
+        self.push_update(id, TextureCacheUpdate {
             rect,
             stride: None,
             offset: 0,
             layer_index: layer_index as i32,
             format_override: None,
             source: TextureUpdateSource::DebugClear,
         });
     }
@@ -475,17 +476,17 @@ impl TextureUpdateList {
     }
 
     /// Pushes a free operation onto the list, potentially coalescing with
     /// previous operations.
     pub fn push_free(&mut self, id: CacheTextureId) {
         self.debug_assert_coalesced(id);
 
         // Drop any unapplied updates to the to-be-freed texture.
-        self.updates.retain(|x| x.id != id);
+        self.updates.remove(&id);
 
         // Drop any allocations for it as well. If we happen to be allocating and
         // freeing in the same batch, we can collapse them to a no-op.
         let idx = self.allocations.iter().position(|x| x.id == id);
         let removed_kind = idx.map(|i| self.allocations.remove(i).kind);
         match removed_kind {
             Some(TextureCacheAllocationKind::Alloc(..)) => { /* no-op! */ },
             Some(TextureCacheAllocationKind::Free) => panic!("Double free"),
--- a/gfx/wr/webrender/src/renderer.rs
+++ b/gfx/wr/webrender/src/renderer.rs
@@ -1514,20 +1514,25 @@ impl GpuCacheTexture {
                 let rows_dirty = rows
                     .iter()
                     .filter(|row| row.is_dirty)
                     .count();
                 if rows_dirty == 0 {
                     return 0
                 }
 
+                let (upload_size, _) = device.required_upload_size_and_stride(
+                    DeviceIntSize::new(MAX_VERTEX_TEXTURE_WIDTH as i32, 1),
+                    texture.get_format(),
+                );
+
                 let mut uploader = device.upload_texture(
                     texture,
                     buffer,
-                    rows_dirty * MAX_VERTEX_TEXTURE_WIDTH,
+                    rows_dirty * upload_size,
                 );
 
                 for (row_index, row) in rows.iter_mut().enumerate() {
                     if !row.is_dirty {
                         continue;
                     }
 
                     let rect = DeviceIntRect::new(
@@ -1658,18 +1663,22 @@ impl<T> VertexDataTexture<T> {
             (MAX_VERTEX_TEXTURE_WIDTH - (MAX_VERTEX_TEXTURE_WIDTH % texels_per_item)) as i32;
 
         let rect = DeviceIntRect::new(
             DeviceIntPoint::zero(),
             DeviceIntSize::new(logical_width, needed_height),
         );
 
         debug_assert!(len <= data.capacity(), "CPU copy will read out of bounds");
+        let (upload_size, _) = device.required_upload_size_and_stride(
+            rect.size,
+            self.texture().get_format(),
+        );
         device
-            .upload_texture(self.texture(), &self.pbo, 0)
+            .upload_texture(self.texture(), &self.pbo, upload_size)
             .upload(rect, 0, None, None, data.as_ptr(), len);
     }
 
     fn deinit(mut self, device: &mut Device) {
         device.delete_pbo(self.pbo);
         if let Some(t) = self.texture.take() {
             device.delete_texture(t);
         }
@@ -3449,93 +3458,117 @@ impl Renderer {
                         }
                     }
 
                     if let Some(old) = old {
                         self.device.delete_texture(old);
                     }
                 }
 
-                for update in update_list.updates {
-                    let TextureCacheUpdate { id, rect, stride, offset, layer_index, format_override, source } = update;
-                    let texture = &self.texture_resolver.texture_cache_map[&id];
-
-                    let bytes_uploaded = match source {
-                        TextureUpdateSource::Bytes { data } => {
-                            let mut uploader = self.device.upload_texture(
-                                texture,
-                                &self.texture_cache_upload_pbo,
-                                0,
-                            );
-                            let data = &data[offset as usize ..];
-                            uploader.upload(
-                                rect,
-                                layer_index,
-                                stride,
-                                format_override,
-                                data.as_ptr(),
-                                data.len(),
-                            )
-                        }
-                        TextureUpdateSource::External { id, channel_index } => {
-                            let mut uploader = self.device.upload_texture(
-                                texture,
-                                &self.texture_cache_upload_pbo,
-                                0,
-                            );
-                            let handler = self.external_image_handler
-                                .as_mut()
-                                .expect("Found external image, but no handler set!");
-                            // The filter is only relevant for NativeTexture external images.
-                            let dummy_data;
-                            let data = match handler.lock(id, channel_index, ImageRendering::Auto).source {
-                                ExternalImageSource::RawData(data) => {
-                                    &data[offset as usize ..]
-                                }
-                                ExternalImageSource::Invalid => {
-                                    // Create a local buffer to fill the pbo.
-                                    let bpp = texture.get_format().bytes_per_pixel();
-                                    let width = stride.unwrap_or(rect.size.width * bpp);
-                                    let total_size = width * rect.size.height;
-                                    // WR haven't support RGBAF32 format in texture_cache, so
-                                    // we use u8 type here.
-                                    dummy_data = vec![0xFFu8; total_size as usize];
-                                    &dummy_data
-                                }
-                                ExternalImageSource::NativeTexture(eid) => {
-                                    panic!("Unexpected external texture {:?} for the texture cache update of {:?}", eid, id);
-                                }
-                            };
-                            let size = uploader.upload(
-                                rect,
-                                layer_index,
-                                stride,
-                                format_override,
-                                data.as_ptr(),
-                                data.len()
-                            );
-                            handler.unlock(id, channel_index);
-                            size
-                        }
-                        TextureUpdateSource::DebugClear => {
+                for (texture_id, updates) in update_list.updates {
+                    let texture = &self.texture_resolver.texture_cache_map[&texture_id];
+                    let device = &mut self.device;
+
+                    // Calculate the total size of the buffer required to upload all updates.
+                    let required_size = updates.iter().map(|update| {
+                        // Perform any debug clears now. As this requires a mutable borrow of device,
+                        // it must be done before all the updates which require a TextureUploader.
+                        if let TextureUpdateSource::DebugClear = update.source {
                             let draw_target = DrawTarget::from_texture(
                                 texture,
-                                layer_index as usize,
+                                update.layer_index as usize,
                                 false,
                             );
-                            self.device.bind_draw_target(draw_target);
-                            self.device.clear_target(
+                            device.bind_draw_target(draw_target);
+                            device.clear_target(
                                 Some(TEXTURE_CACHE_DBG_CLEAR_COLOR),
                                 None,
-                                Some(draw_target.to_framebuffer_rect(rect.to_i32()))
+                                Some(draw_target.to_framebuffer_rect(update.rect.to_i32()))
                             );
+
                             0
+                        } else {
+                            let (upload_size, _) = device.required_upload_size_and_stride(
+                                update.rect.size,
+                                texture.get_format(),
+                            );
+                            upload_size
                         }
-                    };
-                    self.profile_counters.texture_data_uploaded.add(bytes_uploaded >> 10);
+                    }).sum();
+
+                    if required_size == 0 {
+                        continue;
+                    }
+
+                    // For best performance we use a single TextureUploader for all uploads.
+                    // Using individual TextureUploaders was causing performance issues on some drivers
+                    // due to allocating too many PBOs.
+                    let mut uploader = device.upload_texture(
+                        texture,
+                        &self.texture_cache_upload_pbo,
+                        required_size
+                    );
+
+                    for update in updates {
+                        let TextureCacheUpdate { rect, stride, offset, layer_index, format_override, source } = update;
+
+                        let bytes_uploaded = match source {
+                            TextureUpdateSource::Bytes { data } => {
+                                let data = &data[offset as usize ..];
+                                uploader.upload(
+                                    rect,
+                                    layer_index,
+                                    stride,
+                                    format_override,
+                                    data.as_ptr(),
+                                    data.len(),
+                                )
+                            }
+                            TextureUpdateSource::External { id, channel_index } => {
+                                let handler = self.external_image_handler
+                                    .as_mut()
+                                    .expect("Found external image, but no handler set!");
+                                // The filter is only relevant for NativeTexture external images.
+                                let dummy_data;
+                                let data = match handler.lock(id, channel_index, ImageRendering::Auto).source {
+                                    ExternalImageSource::RawData(data) => {
+                                        &data[offset as usize ..]
+                                    }
+                                    ExternalImageSource::Invalid => {
+                                        // Create a local buffer to fill the pbo.
+                                        let bpp = texture.get_format().bytes_per_pixel();
+                                        let width = stride.unwrap_or(rect.size.width * bpp);
+                                        let total_size = width * rect.size.height;
+                                        // WR doesn't support the RGBAF32 format in texture_cache, so
+                                        // we use u8 type here.
+                                        dummy_data = vec![0xFFu8; total_size as usize];
+                                        &dummy_data
+                                    }
+                                    ExternalImageSource::NativeTexture(eid) => {
+                                        panic!("Unexpected external texture {:?} for the texture cache update of {:?}", eid, id);
+                                    }
+                                };
+                                let size = uploader.upload(
+                                    rect,
+                                    layer_index,
+                                    stride,
+                                    format_override,
+                                    data.as_ptr(),
+                                    data.len()
+                                );
+                                handler.unlock(id, channel_index);
+                                size
+                            }
+                            TextureUpdateSource::DebugClear => {
+                                // DebugClear updates are handled separately.
+                                0
+                            }
+                        };
+                        self.profile_counters.texture_data_uploaded.add(bytes_uploaded >> 10);
+                    }
                 }
 
                 if update_list.clears_shared_cache {
                     self.shared_texture_cache_cleared = true;
                 }
             }
 
             drain_filter(
--- a/gfx/wr/webrender/src/texture_cache.rs
+++ b/gfx/wr/webrender/src/texture_cache.rs
@@ -990,22 +990,21 @@ impl TextureCache {
             // Otherwise, pass the external format to the driver.
             let use_upload_format = !self.swizzle.is_some();
             let (layer_index, origin) = entry.details.describe();
             let op = TextureCacheUpdate::new_update(
                 data,
                 &descriptor,
                 origin,
                 entry.size,
-                entry.texture_id,
                 layer_index as i32,
                 use_upload_format,
                 &dirty_rect,
             );
-            self.pending_updates.push_update(op);
+            self.pending_updates.push_update(entry.texture_id, op);
         }
     }
 
     // Check if a given texture handle has a valid allocation
     // in the texture cache.
     pub fn is_allocated(&self, handle: &TextureCacheHandle) -> bool {
         self.entries.get_opt(handle).is_some()
     }
@@ -1868,17 +1867,16 @@ impl TextureCacheUpdate {
     // Constructs a TextureCacheUpdate operation to be passed to the
     // rendering thread in order to do an upload to the right
     // location in the texture cache.
     fn new_update(
         data: CachedImageData,
         descriptor: &ImageDescriptor,
         origin: DeviceIntPoint,
         size: DeviceIntSize,
-        texture_id: CacheTextureId,
         layer_index: i32,
         use_upload_format: bool,
         dirty_rect: &ImageDirtyRect,
     ) -> TextureCacheUpdate {
         let source = match data {
             CachedImageData::Blob => {
                 panic!("The vector image should have been rasterized.");
             }
@@ -1908,34 +1906,32 @@ impl TextureCacheUpdate {
 
         match *dirty_rect {
             DirtyRect::Partial(dirty) => {
                 // the dirty rectangle doesn't have to be within the area but has to intersect it, at least
                 let stride = descriptor.compute_stride();
                 let offset = descriptor.offset + dirty.origin.y * stride + dirty.origin.x * descriptor.format.bytes_per_pixel();
 
                 TextureCacheUpdate {
-                    id: texture_id,
                     rect: DeviceIntRect::new(
                         DeviceIntPoint::new(origin.x + dirty.origin.x, origin.y + dirty.origin.y),
                         DeviceIntSize::new(
                             dirty.size.width.min(size.width - dirty.origin.x),
                             dirty.size.height.min(size.height - dirty.origin.y),
                         ),
                     ),
                     source,
                     stride: Some(stride),
                     offset,
                     format_override,
                     layer_index,
                 }
             }
             DirtyRect::All => {
                 TextureCacheUpdate {
-                    id: texture_id,
                     rect: DeviceIntRect::new(origin, size),
                     source,
                     stride: descriptor.stride,
                     offset: descriptor.offset,
                     format_override,
                     layer_index,
                 }
             }