Bug 1479290 - Reduce memory usage of wpt-update, r=ato
authorJames Graham <james@hoppipolla.co.uk>
Sun, 29 Jul 2018 16:13:15 +0100
changeset 429445 ac688f61b766f81f16820b6a817c6fa0a1ad22da
parent 429444 f579a49b182b2a0cc9c3c1e6fb74d073b3b85e21
child 429446 1d3fb65a58da4e2f73fe3f3c8b5d6d340269566d
push id34364
push userbtara@mozilla.com
push dateTue, 31 Jul 2018 21:59:26 +0000
treeherdermozilla-central@d57a89840dbb [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersato
bugs1479290, 1476053
milestone63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1479290 - Reduce memory usage of wpt-update, r=ato Summary: The memory usage of wpt-update has always been high, and accidentally regressed in Bug 1476053 to really problematic levels when doing a full metadata update for all test results. This patch implements a number of changes that reduce the peak memory usage to ~1Gb and the heap after reading all results data to ~600Mb. There are several changes in this patch: * Change from a dict {test: [(subtest, prop, run_info, result)]} to a nested dictionary {test: {subtest: [(prop, run_info, result)]}}. This fixes the silliness that caused the previous regression. * Convert all unicode data to bytestrings and then intern() the bytestring. This allows reusing string allocations for repeated strings, of which there are many. * Process the test manifests upfront and then allow the manifest data to be gc'd before starting to process the results files, so that we are not holding on to data that is no longer required. * Stop storing status-type results as (status, default_expected); just look up the default_expected for a specific test if we have to update the expectations for that test. * Add __slots__ to all frequently-allocated classes to avoid the overhead of constructing a __dict__ for each instance, which typically has size overhead (see e.g. http://book.pythontips.com/en/latest/__slots__magic.html ) * Move to using a custom compact representation for the list of per-subtest results i.e. [(prop, run_info, results)]. This uses an array.array of 2-byte ints to store the entries and effectively interns the values, bit packing the representations of property to be updated and result (if it's a test status) into 4 bits each in the first byte, and the run info into the second byte. Properties with a complex data representation (e.g. LSAN failures) are stored directly, with a 0 value of the status bits indicating that the complex value should be read. 
Reviewers: ato Tags: #secure-revision Bug #: 1479290 Differential Revision: https://phabricator.services.mozilla.com/D2496 MozReview-Commit-ID: 7zGpdzdaqRj
testing/web-platform/tests/tools/wptrunner/wptrunner/metadata.py
--- a/testing/web-platform/tests/tools/wptrunner/wptrunner/metadata.py
+++ b/testing/web-platform/tests/tools/wptrunner/wptrunner/metadata.py
@@ -1,8 +1,9 @@
+import array
 import os
 import shutil
 import tempfile
 import uuid
 from collections import defaultdict, namedtuple
 
 from mozlog import structuredlog
 
@@ -12,42 +13,36 @@ import wptmanifest
 import wpttest
 from expected import expected_path
 from vcs import git
 manifest = None  # Module that will be imported relative to test_root
 manifestitem = None
 
 logger = structuredlog.StructuredLogger("web-platform-tests")
 
-
 try:
     import ujson as json
 except ImportError:
     import json
 
 
-def load_test_manifests(serve_root, test_paths):
-    do_delayed_imports(serve_root)
-    manifest_loader = testloader.ManifestLoader(test_paths, False)
-    return manifest_loader.load()
-
-
 def update_expected(test_paths, serve_root, log_file_names,
                     rev_old=None, rev_new="HEAD", ignore_existing=False,
                     sync_root=None, property_order=None, boolean_properties=None,
                     stability=None):
     """Update the metadata files for web-platform-tests based on
     the results obtained in a previous run or runs
 
     If stability is not None, assume log_file_names refers to logs from repeated
     test jobs, disable tests that don't behave as expected on all runs"""
+    do_delayed_imports(serve_root)
 
-    manifests = load_test_manifests(serve_root, test_paths)
+    id_test_map = load_test_data(test_paths)
 
-    for metadata_path, updated_ini in update_from_logs(manifests,
+    for metadata_path, updated_ini in update_from_logs(id_test_map,
                                                        *log_file_names,
                                                        ignore_existing=ignore_existing,
                                                        property_order=property_order,
                                                        boolean_properties=boolean_properties,
                                                        stability=stability):
 
         write_new_expected(metadata_path, updated_ini)
         if stability:
@@ -117,43 +112,114 @@ def unexpected_changes(manifests, change
 #   for each conditional:
 #      If all the new values match (or there aren't any) retain that conditional
 #      If any new values mismatch:
 #           If stability and any repeated values don't match, disable the test
 #           else mark the test as needing human attention
 #   Check if all the RHS values are the same; if so collapse the conditionals
 
 
-def update_from_logs(manifests, *log_filenames, **kwargs):
+class InternedData(object):
+    """Class for interning data of any (hashable) type.
+
+    This class is intended for building a mapping of int <=> value, such
+    that the integer may be stored as a proxy for the real value, and then
+    the real value obtained later from the proxy value.
+
+    In order to support the use case of packing the integer value as binary,
+    it is possible to specify a maximum bitsize of the data; adding more items
+    than this allows will result in a ValueError exception.
+
+    The zero value is reserved to use as a sentinel."""
+
+    type_conv = None
+    rev_type_conv = None
+
+    def __init__(self, max_bits=8):
+        self.max_idx = 2**max_bits - 2
+        # Reserve 0 as a sentinel
+        self._data = [None], {}
+
+    def store(self, obj):
+        if self.type_conv is not None:
+            obj = self.type_conv(obj)
+
+        objs, obj_to_idx = self._data
+        if obj not in obj_to_idx:
+            value = len(objs)
+            objs.append(obj)
+            obj_to_idx[obj] = value
+            if value > self.max_idx:
+                raise ValueError
+        else:
+            value = obj_to_idx[obj]
+        return value
+
+    def get(self, idx):
+        obj = self._data[0][idx]
+        if self.rev_type_conv is not None:
+            obj = self.rev_type_conv(obj)
+        return obj
+
+
+class RunInfoInterned(InternedData):
+    def type_conv(self, value):
+        return tuple(value.items())
+
+    def rev_type_conv(self, value):
+        return dict(value)
+
+
+prop_intern = InternedData(4)
+run_info_intern = RunInfoInterned()
+status_intern = InternedData(4)
+
+
+def load_test_data(test_paths):
+    manifest_loader = testloader.ManifestLoader(test_paths, False)
+    manifests = manifest_loader.load()
+
+    id_test_map = {}
+    for test_manifest, paths in manifests.iteritems():
+        id_test_map.update(create_test_tree(paths["metadata_path"],
+                                            test_manifest))
+    return id_test_map
+
+
+def update_from_logs(id_test_map, *log_filenames, **kwargs):
     ignore_existing = kwargs.get("ignore_existing", False)
     property_order = kwargs.get("property_order")
     boolean_properties = kwargs.get("boolean_properties")
     stability = kwargs.get("stability")
 
-    id_test_map = {}
+    updater = ExpectedUpdater(id_test_map,
+                              ignore_existing=ignore_existing)
 
-    for test_manifest, paths in manifests.iteritems():
-        id_test_map.update(create_test_tree(
-            paths["metadata_path"],
-            test_manifest))
-
-    updater = ExpectedUpdater(manifests,
-                              id_test_map,
-                              ignore_existing=ignore_existing)
-    for log_filename in log_filenames:
+    for i, log_filename in enumerate(log_filenames):
+        print("Processing log %d/%d" % (i + 1, len(log_filenames)))
         with open(log_filename) as f:
             updater.update_from_log(f)
+
     for item in update_results(id_test_map, property_order, boolean_properties, stability):
         yield item
 
 
 def update_results(id_test_map, property_order, boolean_properties, stability):
     test_file_items = set(id_test_map.itervalues())
+
+    default_expected_by_type = {}
+    for test_type, test_cls in wpttest.manifest_test_cls.iteritems():
+        if test_cls.result_cls:
+            default_expected_by_type[(test_type, False)] = test_cls.result_cls.default_expected
+        if test_cls.subtest_result_cls:
+            default_expected_by_type[(test_type, True)] = test_cls.subtest_result_cls.default_expected
+
     for test_file in test_file_items:
-        updated_expected = test_file.update(property_order, boolean_properties, stability)
+        updated_expected = test_file.update(property_order, boolean_properties, stability,
+                                            default_expected_by_type)
         if updated_expected is not None and updated_expected.modified:
             yield test_file.metadata_path, updated_expected
 
 
 def directory_manifests(metadata_path):
     rv = []
     for dirpath, dirname, filenames in os.walk(metadata_path):
         if "__dir__.ini" in filenames:
@@ -209,35 +275,28 @@ def write_new_expected(metadata_path, ex
     else:
         try:
             os.unlink(path)
         except OSError:
             pass
 
 
 class ExpectedUpdater(object):
-    def __init__(self, test_manifests, id_test_map, ignore_existing=False):
+    def __init__(self, id_test_map, ignore_existing=False):
         self.id_test_map = id_test_map
         self.ignore_existing = ignore_existing
         self.run_info = None
         self.action_map = {"suite_start": self.suite_start,
                            "test_start": self.test_start,
                            "test_status": self.test_status,
                            "test_end": self.test_end,
                            "assertion_count": self.assertion_count,
                            "lsan_leak": self.lsan_leak}
         self.tests_visited = {}
 
-        self.types_by_path = {}
-        for manifest in test_manifests.iterkeys():
-            for test_type, path, _ in manifest:
-                if test_type in wpttest.manifest_test_cls:
-                    self.types_by_path[path] = wpttest.manifest_test_cls[test_type]
-        self.run_infos = []
-
     def update_from_log(self, log_file):
         self.run_info = None
         try:
             data = json.load(log_file)
         except Exception:
             pass
         else:
             if "action" not in data and "results" in data:
@@ -277,224 +336,275 @@ class ExpectedUpdater(object):
                 action_map["assertion_count"]({"test": test["test"],
                                                "count": asserts["count"],
                                                "min_expected": asserts["min"],
                                                "max_expected": asserts["max"]})
         for item in data.get("lsan_leaks", []):
             action_map["lsan_leak"](item)
 
     def suite_start(self, data):
-        self.run_info = data["run_info"]
+        self.run_info = run_info_intern.store(data["run_info"])
 
     def test_start(self, data):
-        test_id = data["test"]
+        test_id = intern(data["test"].encode("utf8"))
         try:
             test_data = self.id_test_map[test_id]
         except KeyError:
             print "Test not found %s, skipping" % test_id
             return
 
         if self.ignore_existing:
             test_data.set_requires_update()
             test_data.clear.append("expected")
         self.tests_visited[test_id] = set()
 
     def test_status(self, data):
-        test_id = data["test"]
-        subtest = data["subtest"]
+        test_id = intern(data["test"].encode("utf8"))
+        subtest = intern(data["subtest"].encode("utf8"))
         test_data = self.id_test_map.get(test_id)
         if test_data is None:
             return
 
-        test_cls = self.types_by_path[test_data.test_path]
-
         self.tests_visited[test_id].add(subtest)
 
-        result = test_cls.subtest_result_cls(
-            subtest,
-            data["status"],
-            None)
+        result = status_intern.store(data["status"])
 
         test_data.set(test_id, subtest, "status", self.run_info, result)
         if data.get("expected") and data["expected"] != data["status"]:
             test_data.set_requires_update()
 
     def test_end(self, data):
         if data["status"] == "SKIP":
             return
 
-        test_id = data["test"]
+        test_id = intern(data["test"].encode("utf8"))
         test_data = self.id_test_map.get(test_id)
         if test_data is None:
             return
-        test_cls = self.types_by_path[test_data.test_path]
-
 
-        result = test_cls.result_cls(
-            data["status"],
-            None)
+        result = status_intern.store(data["status"])
+
         test_data.set(test_id, None, "status", self.run_info, result)
         if data.get("expected") and data["status"] != data["expected"]:
             test_data.set_requires_update()
         del self.tests_visited[test_id]
 
     def assertion_count(self, data):
-        test_id = data["test"]
+        test_id = intern(data["test"].encode("utf8"))
         test_data = self.id_test_map.get(test_id)
         if test_data is None:
             return
 
         test_data.set(test_id, None, "asserts", self.run_info, data["count"])
         if data["count"] < data["min_expected"] or data["count"] > data["max_expected"]:
             test_data.set_requires_update()
 
     def lsan_leak(self, data):
         dir_path = data.get("scope", "/")
-        dir_id = os.path.join(dir_path, "__dir__").replace(os.path.sep, "/")
+        dir_id = intern(os.path.join(dir_path, "__dir__").replace(os.path.sep, "/").encode("utf8"))
         if dir_id.startswith("/"):
             dir_id = dir_id[1:]
         test_data = self.id_test_map[dir_id]
-        test_data.set(dir_id, None, "lsan", self.run_info, (data["frames"], data.get("allowed_match")))
+        test_data.set(dir_id, None, "lsan",
+                      self.run_info, (data["frames"], data.get("allowed_match")))
         if not data.get("allowed_match"):
             test_data.set_requires_update()
 
 
 def create_test_tree(metadata_path, test_manifest):
-    """Create a map of expectation manifests for all tests in test_manifest,
-    reading existing manifests under manifest_path
-
-    :returns: A map of test_id to (manifest, test, expectation_data)
+    """Create a map of test_id to TestFileData for that test.
     """
     id_test_map = {}
     exclude_types = frozenset(["stub", "helper", "manual", "support", "conformancechecker"])
     all_types = [item.item_type for item in manifestitem.__dict__.itervalues()
                  if type(item) == type and
                  issubclass(item, manifestitem.ManifestItem) and
                  item.item_type is not None]
     include_types = set(all_types) - exclude_types
-    for _, test_path, tests in test_manifest.itertypes(*include_types):
-        test_file_data = TestFileData(test_manifest,
+    for item_type, test_path, tests in test_manifest.itertypes(*include_types):
+        test_file_data = TestFileData(intern(test_manifest.url_base.encode("utf8")),
+                                      intern(item_type.encode("utf8")),
                                       metadata_path,
                                       test_path,
                                       tests)
         for test in tests:
-            id_test_map[test.id] = test_file_data
+            id_test_map[intern(test.id.encode("utf8"))] = test_file_data
 
         dir_path = os.path.split(test_path)[0].replace(os.path.sep, "/")
         while True:
             if dir_path:
                 dir_id = dir_path + "/__dir__"
             else:
                 dir_id = "__dir__"
-            dir_id = (test_manifest.url_base + dir_id).lstrip("/")
+            dir_id = intern((test_manifest.url_base + dir_id).lstrip("/").encode("utf8"))
             if dir_id not in id_test_map:
-                test_file_data = TestFileData(test_manifest,
+                test_file_data = TestFileData(intern(test_manifest.url_base.encode("utf8")),
+                                              None,
                                               metadata_path,
                                               dir_id,
                                               [])
                 id_test_map[dir_id] = test_file_data
             if not dir_path or dir_path in id_test_map:
                 break
             dir_path = dir_path.rsplit("/", 1)[0] if "/" in dir_path else ""
 
     return id_test_map
 
 
+class PackedResultList(object):
+    """Class for storing test results.
+
+    Results are stored as an array of 2-byte integers for compactness.
+    The first 4 bits represent the property name, the second 4 bits
+    represent the test status (if it's a result with a status code), and
+    the final 8 bits represent the run_info. If the result doesn't have a
+    simple status code but instead a richer type, we place that richer type
+    in a dictionary and set the status part of the result type to 0.
+
+    This class depends on the global prop_intern, run_info_intern and
+    status_intern InternedData objects to convert between the bit values
+    and corresponding Python objects."""
+
+    def __init__(self):
+        self.data = array.array("H")
+
+    __slots__ = ("data", "raw_data")
+
+    def append(self, prop, run_info, value):
+        out_val = (prop << 12) + run_info
+        if prop == prop_intern.store("status"):
+            out_val += value << 8
+        else:
+            if not hasattr(self, "raw_data"):
+                self.raw_data = {}
+            self.raw_data[len(self.data)] = value
+        self.data.append(out_val)
+
+    def unpack(self, idx, packed):
+        prop = prop_intern.get((packed & 0xF000) >> 12)
+
+        value_idx = (packed & 0x0F00) >> 8
+        if value_idx is 0:
+            value = self.raw_data[idx]
+        else:
+            value = status_intern.get(value_idx)
+
+        run_info = run_info_intern.get((packed & 0x00FF))
+
+        return prop, run_info, value
+
+    def __iter__(self):
+        for i, item in enumerate(self.data):
+            yield self.unpack(i, item)
+
+
 class TestFileData(object):
-    def __init__(self, test_manifest, metadata_path, test_path, tests):
-        self.test_manifest = test_manifest
+    __slots__ = ("url_base", "item_type", "test_path", "metadata_path", "tests",
+                 "_requires_update", "clear", "data")
+    def __init__(self, url_base, item_type, metadata_path, test_path, tests):
+        self.url_base = url_base
+        self.item_type = item_type
         self.test_path = test_path
         self.metadata_path = metadata_path
-        self.tests = tests
-        self._expected = None
+        self.tests = {intern(item.id.encode("utf8")) for item in tests}
         self._requires_update = False
         self.clear = set()
-        self.data = []
+        self.data = defaultdict(lambda: defaultdict(PackedResultList))
 
     def set_requires_update(self):
         self._requires_update = True
 
     def set(self, test_id, subtest_id, prop, run_info, value):
-        self.data.append((test_id, subtest_id, prop, run_info, value))
+        self.data[test_id][subtest_id].append(prop_intern.store(prop),
+                                              run_info,
+                                              value)
 
     def expected(self, property_order, boolean_properties):
-        if self._expected is None:
-            expected_data = load_expected(self.test_manifest,
-                                          self.metadata_path,
-                                          self.test_path,
-                                          self.tests,
-                                          property_order,
-                                          boolean_properties)
-            if expected_data is None:
-                expected_data = create_expected(self.test_manifest,
-                                                self.test_path,
-                                                property_order,
-                                                boolean_properties)
-            self._expected = expected_data
-        return self._expected
+        expected_data = load_expected(self.url_base,
+                                      self.metadata_path,
+                                      self.test_path,
+                                      self.tests,
+                                      property_order,
+                                      boolean_properties)
+        if expected_data is None:
+            expected_data = create_expected(self.url_base,
+                                            self.test_path,
+                                            property_order,
+                                            boolean_properties)
+        return expected_data
 
-    def update(self, property_order, boolean_properties, stability):
+    def update(self, property_order, boolean_properties, stability,
+               default_expected_by_type):
         if not self._requires_update:
             return
 
         expected = self.expected(property_order, boolean_properties)
         expected_by_test = {}
 
-        for test in self.tests:
-            if not expected.has_test(test.id):
-                expected.append(manifestupdate.TestNode.create(test.id))
-            test_expected = expected.get_test(test.id)
-            expected_by_test[test.id] = test_expected
+        for test_id in self.tests:
+            if not expected.has_test(test_id):
+                expected.append(manifestupdate.TestNode.create(test_id))
+            test_expected = expected.get_test(test_id)
+            expected_by_test[test_id] = test_expected
             for prop in self.clear:
                 test_expected.clear(prop)
 
-        for (test_id, subtest_id, prop, run_info, value) in self.data:
-            # Special case directory metadata
-            if subtest_id is None and test_id.endswith("__dir__"):
-                if prop == "lsan":
-                    expected.set_lsan(run_info, value)
-                continue
+        for test_id, test_data in self.data.iteritems():
+            for subtest_id, results_list in test_data.iteritems():
+                for prop, run_info, value in results_list:
+                    # Special case directory metadata
+                    if subtest_id is None and test_id.endswith("__dir__"):
+                        if prop == "lsan":
+                            expected.set_lsan(run_info, value)
+                        continue
 
-            test_expected = expected_by_test[test_id]
-            if subtest_id is None:
-                item_expected = test_expected
-            else:
-                item_expected = test_expected.get_subtest(subtest_id)
-            if prop == "status":
-                item_expected.set_result(run_info, value)
-            elif prop == "asserts":
-                item_expected.set_asserts(run_info, value)
+                    if prop == "status":
+                        value = Result(value, default_expected_by_type[self.item_type,
+                                                                       subtest_id is not None])
+
+                    test_expected = expected_by_test[test_id]
+                    if subtest_id is None:
+                        item_expected = test_expected
+                    else:
+                        item_expected = test_expected.get_subtest(subtest_id)
+                    if prop == "status":
+                        item_expected.set_result(run_info, value)
+                    elif prop == "asserts":
+                        item_expected.set_asserts(run_info, value)
 
         expected.coalesce_properties(stability=stability)
         for test in expected.iterchildren():
             for subtest in test.iterchildren():
                 subtest.coalesce_properties(stability=stability)
             test.coalesce_properties(stability=stability)
 
         return expected
 
 
-def create_expected(test_manifest, test_path, property_order=None,
+Result = namedtuple("Result", ["status", "default_expected"])
+
+
+def create_expected(url_base, test_path, property_order=None,
                     boolean_properties=None):
-    expected = manifestupdate.ExpectedManifest(None, test_path, test_manifest.url_base,
+    expected = manifestupdate.ExpectedManifest(None,
+                                               test_path,
+                                               url_base,
                                                property_order=property_order,
                                                boolean_properties=boolean_properties)
     return expected
 
 
-def load_expected(test_manifest, metadata_path, test_path, tests, property_order=None,
+def load_expected(url_base, metadata_path, test_path, tests, property_order=None,
                   boolean_properties=None):
     expected_manifest = manifestupdate.get_manifest(metadata_path,
                                                     test_path,
-                                                    test_manifest.url_base,
+                                                    url_base,
                                                     property_order=property_order,
                                                     boolean_properties=boolean_properties)
     if expected_manifest is None:
         return
 
-    tests_by_id = {item.id for item in tests}
-
     # Remove expected data for tests that no longer exist
     for test in expected_manifest.iterchildren():
-        if test.id not in tests_by_id:
+        if test.id not in tests:
             test.remove()
 
     return expected_manifest