author Henri Sivonen <>
Fri, 06 Jul 2018 10:44:43 +0300
changeset 489140 4ef0f163fdeb9afeddd87b37bfd987298c038542
parent 476323 da3c81f986fa63b42a2874b1791eddcdbfbe8424
permissions -rw-r--r--
Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj. Correctness improvements: * UTF errors are handled safely per spec instead of dangerously truncating strings. * There are fewer converter implementations. Performance improvements: * The old code did exact buffer length math, which meant doing UTF math twice on each input string (once for length calculation and another time for conversion). Exact length math is more complicated when handling errors properly, which the old code didn't do. The new code does UTF math on the string content only once (when converting) but risks allocating more than once. There are heuristics in place to lower the probability of reallocation in cases where the double math avoidance isn't enough of a saving to absorb an allocation and memcpy. * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized but a single non-ASCII code point pessimized the rest of the string. The new code tries to get back on the fast ASCII path. * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range input to eliminate an operation from the inner loop on x86/x86_64. * When assigning to a pre-existing string, the new code tries to reuse the old buffer instead of first releasing the old buffer and then allocating a new one. * When reallocating from the new code, the memcpy covers only the data that is part of the logical length of the old string instead of memcpying the whole capacity. (For old callers old excess memcpy behavior is preserved due to bogus callers. See bug 1472113.) * UTF-8 strings in XPConnect that are in the Latin1 range are passed to SpiderMonkey as Latin1. New features: * Conversion between UTF-8 and Latin1 is added in order to enable faster future interop between Rust code (or otherwise UTF-8-using code) and text node and SpiderMonkey code that uses Latin1. MozReview-Commit-ID: JaJuExfILM9

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import os
import time
import zipfile

from mozbuild.util import lock_file

class ZipFile(zipfile.ZipFile):
    """Class with methods to open, read, write, close, list zip files.

    Subclassing zipfile.ZipFile to allow for overwriting of existing
    entries, though only for writestr, not for write.

    Entries that are replaced but cannot be overwritten in place are
    collected in ``self._remove``; close() then rewrites the file without
    them.
    """

    def __init__(self, file, mode="r", compression=zipfile.ZIP_STORED,
                 lock=False):
        """Open the archive, optionally taking a lock file next to it.

        file -- the archive to open; must be a path string when `lock` is
                true, because the lock file name is derived from it.
        mode, compression -- passed through to zipfile.ZipFile.
        lock -- when true, the result of lock_file(file + '.lck') is stored
                in self.lockfile until close() resets it to None.
                NOTE(review): presumably dropping that reference releases
                the lock; confirm against mozbuild.util.lock_file.
        """
        if lock:
            assert isinstance(file, basestring)
            self.lockfile = lock_file(file + '.lck')
        else:
            self.lockfile = None

        if mode == 'a' and lock:
            # appending to a file which doesn't exist fails, but we can't check
            # existence until we hold the lock
            if (not os.path.isfile(file)) or os.path.getsize(file) == 0:
                mode = 'w'

        zipfile.ZipFile.__init__(self, file, mode, compression)
        # Replaced entries whose bytes are still in the file; see close().
        self._remove = []
        # Offset just past the last entry's data, i.e. where the next new
        # entry has to be written.
        self.end = self.fp.tell()
        self.debug = 0

    def writestr(self, zinfo_or_arcname, bytes):
        """Write contents into the archive.

        The contents is the argument 'bytes',  'zinfo_or_arcname' is either
        a ZipInfo instance or the name of the file in the archive.
        This method is overloaded to allow overwriting existing entries.
        """
        if isinstance(zinfo_or_arcname, zipfile.ZipInfo):
            zinfo = zinfo_or_arcname
        else:
            zinfo = zipfile.ZipInfo(filename=zinfo_or_arcname,
                                    date_time=time.localtime(time.time())[:6])
            zinfo.compress_type = self.compression
            # Add some standard UNIX file access permissions (-rw-r--r--).
            zinfo.external_attr = (0x81a4 & 0xFFFF) << 16

        # Now to the point why we overwrote this in the first place,
        # remember the entry numbers if we already had this entry.
        # Optimizations:
        # If the entry to overwrite is the last one, just reuse that.
        # If we store uncompressed and the new content has the same size
        # as the old, reuse the existing entry.

        doSeek = False  # store if we need to seek to the eof after overwriting
        if zinfo.filename in self.NameToInfo:
            # Find the last ZipInfo with our name.
            # Last, because that's catching multiple overwrites
            i = len(self.filelist)
            while i > 0:
                i -= 1
                if self.filelist[i].filename == zinfo.filename:
                    break
            zi = self.filelist[i]
            is_last = (i + 1) == len(self.filelist)
            same_size_stored = (zinfo.compress_type == zipfile.ZIP_STORED
                                and zi.compress_size == len(bytes))
            if same_size_stored or is_last:
                # make sure we're allowed to write, otherwise done by writestr below
                self._writecheck(zi)
                # overwrite existing entry
                self.fp.seek(zi.header_offset)
                if is_last:
                    # this is the last item in the file, just truncate
                    self.fp.truncate()
                else:
                    # we need to move to the end of the file afterwards again
                    doSeek = True
                # unhook the current zipinfo, the writestr of our superclass
                # will add a new one
                self.filelist.pop(i)
                self.NameToInfo.pop(zinfo.filename)
            else:
                # Couldn't optimize, sadly, just remember the old entry for removal
                self._remove.append(self.filelist.pop(i))
        zipfile.ZipFile.writestr(self, zinfo, bytes)
        # Keep the entry list in on-disk order; an in-place overwrite appends
        # the new ZipInfo at the end of the list although it sits mid-file.
        self.filelist.sort(key=lambda entry: entry.header_offset)
        if doSeek:
            self.fp.seek(self.end)
        self.end = self.fp.tell()

    def close(self):
        """Close the file, and for mode "w" and "a" write the ending
        records.

        Overwritten to compact overwritten entries.
        """
        if not self._remove:
            # we don't have anything special to do, let's just call base
            r = zipfile.ZipFile.close(self)
            self.lockfile = None
            return r

        if self.fp.mode != 'r+b':
            # adjust file mode if we originally just wrote, now we rewrite
            self.fp.close()
            self.fp = open(self.filename, 'r+b')
        # Every entry whose bytes are in the file, live or replaced, each
        # flagged with whether it is kept, in on-disk order.
        entries = [(zi, True) for zi in self.filelist]
        entries.extend((zi, False) for zi in self._remove)
        entries.sort(key=lambda pair: pair[0].header_offset)
        # empty _remove for multiple closes
        self._remove = []

        # An entry's on-disk length is the distance to the next entry's
        # header; the last entry runs up to self.end.
        lengths = [following[0].header_offset - current[0].header_offset
                   for current, following in zip(entries, entries[1:])]
        lengths.append(self.end - entries[-1][0].header_offset)
        to_pos = 0
        for (zi, keep), length in zip(entries, lengths):
            if not keep:
                # Skipping a dropped entry is what closes the gap: to_pos
                # is not advanced, so later entries slide down over it.
                continue
            oldoff = zi.header_offset
            # python <= 2.4 has file_offset
            if hasattr(zi, 'file_offset'):
                zi.file_offset = zi.file_offset + to_pos - oldoff
            zi.header_offset = to_pos
            self.fp.seek(oldoff)
            content = self.fp.read(length)
            self.fp.seek(to_pos)
            self.fp.write(content)
            to_pos += length
        # Cut off the stale tail left behind by the compaction before the
        # base class finishes the archive.
        self.fp.seek(to_pos)
        self.fp.truncate()
        zipfile.ZipFile.close(self)
        self.lockfile = None