Bug 1430825 - [mozlint] Split work up by paths instead of by linters, r?Standard8 draft
authorAndrew Halberstadt <ahalberstadt@mozilla.com>
Tue, 16 Jan 2018 16:01:20 -0500
changeset 721903 470394e5da4cd7ad79172fbb32ceb2c93380e6af
parent 721703 e7a33d4b7271602c41347a99be1512d74ce78d91
child 746479 ee9c6dbec6ffb744f457df6901e4cf6ee055a606
push id95994
push userahalberstadt@mozilla.com
push dateThu, 18 Jan 2018 02:40:03 +0000
Bug 1430825 - [mozlint] Split work up by paths instead of by linters, r?Standard8 The initial motivation for this patch, was to prevent command lines that are too long on Windows. To that end, there is a cap to the number of paths that can be run per job. For now that cap is set to 50. This will allow for an average path length of 160 characters, which should be sufficient with room to spare. But another big benefit of this patch is that we are running more things in parallel. Previously, mozlint ran each linter in its own subprocess, but that's it. If running eslint is 90% of the work, it'll still only get a single process. This means we are wasting cores as soon as the other linters are finished. This patch chunks the number of specified paths such that there will be N*L jobs where 'N' is the number of cores and 'L' is the number of linters. This means even when there's a dominant linter, we'll be making better use of our resources. This isn't perfect of course, as some paths might contain a small number of files, and some will contain a very large number of files. But it's a start. A limitation to this approach is that if there are fewer paths specified than there are cores, we won't schedule enough jobs per linter to use those extra cores. One idea might be to expand specified directories and individually list all the paths under the directory. But this has some hairy edge cases that would be tough to catch. Doing this in a non-hacky way would also require a medium scale refactor. So I propose further parallelization efforts be deferred to follow-ups. MozReview-Commit-ID: JRRu13AFaii
--- a/python/mozlint/mozlint/roller.py
+++ b/python/mozlint/mozlint/roller.py
@@ -5,16 +5,17 @@
 from __future__ import absolute_import, print_function, unicode_literals
 import os
 import signal
 import sys
 import traceback
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor
+from math import ceil
 from multiprocessing import cpu_count
 from subprocess import CalledProcessError
 from mozversioncontrol import get_repository_object, MissingUpstreamRepo, InvalidRepoPath
 from .errors import LintersNotConfigured
 from .parser import Parser
 from .types import supported_types
@@ -51,16 +52,17 @@ def _run_worker(*args, **kwargs):
 class LintRoller(object):
     """Registers and runs linters.
     :param root: Path to which relative paths will be joined. If
                  unspecified, root will either be determined from
                  version control or cwd.
     :param lintargs: Arguments to pass to the underlying linter(s).
+    MAX_PATHS_PER_JOB = 50  # set a max size to prevent command lines that are too long on Windows
     def __init__(self, root, **lintargs):
         self.parse = Parser()
             self.vcs = get_repository_object(root)
         except InvalidRepoPath:
             self.vcs = None
@@ -77,16 +79,24 @@ class LintRoller(object):
         :param paths: A path or iterable of paths to linter definitions.
         if isinstance(paths, basestring):
             paths = (paths,)
         for path in paths:
+    def _generate_jobs(self, paths, num_procs):
+        """A job is of the form (<linter:dict>, <paths:list>)."""
+        chunk_size = min(self.MAX_PATHS_PER_JOB, int(ceil(float(len(paths)) / num_procs)))
+        while paths:
+            for linter in self.linters:
+                yield linter, paths[:chunk_size]
+            paths = paths[chunk_size:]
     def roll(self, paths=None, outgoing=None, workdir=None, num_procs=None):
         """Run all of the registered linters against the specified file paths.
         :param paths: An iterable of files and/or directories to lint.
         :param outgoing: Lint files touched by commits that are not on the remote repository.
         :param workdir: Lint all files touched in the working directory.
         :param num_procs: The number of processes to use. Default: cpu count
         :return: A dictionary with file names as the key, and a list of
@@ -127,22 +137,20 @@ class LintRoller(object):
         paths = paths or ['.']
         # This will convert paths back to a list, but that's ok since
         # we're done adding to it.
         paths = map(os.path.abspath, paths)
         num_procs = num_procs or cpu_count()
-        num_procs = min(num_procs, len(self.linters))
         all_results = defaultdict(list)
         with ProcessPoolExecutor(num_procs) as executor:
-            futures = [executor.submit(_run_worker, config, paths, **self.lintargs)
-                       for config in self.linters]
+            futures = [executor.submit(_run_worker, config, p, **self.lintargs)
+                       for config, p in self._generate_jobs(paths, num_procs)]
             # ignore SIGINT in parent so we can still get partial results
             # from child processes. These should shutdown quickly anyway.
             orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
             self.failed = []
             for future in futures:
                 results, failed = future.result()
                 if failed:
--- a/python/mozlint/test/test_roller.py
+++ b/python/mozlint/test/test_roller.py
@@ -5,17 +5,17 @@
 from __future__ import absolute_import
 import os
 import sys
 import mozunit
 import pytest
-from mozlint import ResultContainer
+from mozlint import ResultContainer, roller
 from mozlint.errors import LintersNotConfigured, LintException
 here = os.path.abspath(os.path.dirname(__file__))
 linters = ('string.yml', 'regex.yml', 'external.yml')
@@ -71,15 +71,48 @@ def test_roll_with_invalid_extension(lin
     assert len(result) == 0
     assert lint.failed == []
 def test_roll_with_failure_code(lint, lintdir, files):
     lint.read(os.path.join(lintdir, 'badreturncode.yml'))
     assert lint.failed is None
-    result = lint.roll(files)
+    result = lint.roll(files, num_procs=1)
     assert len(result) == 0
     assert lint.failed == ['BadReturnCodeLinter']
+def fake_run_linters(config, paths, **lintargs):
+    return {'count': [1]}, []
+@pytest.mark.parametrize('num_procs', [1, 4, 8, 16])
+def test_number_of_jobs(monkeypatch, lint, linters, files, num_procs):
+    monkeypatch.setattr(roller, '_run_linters', fake_run_linters)
+    lint.read(linters)
+    num_jobs = len(lint.roll(files, num_procs=num_procs)['count'])
+    if len(files) >= num_procs:
+        assert num_jobs == num_procs * len(linters)
+    else:
+        assert num_jobs == len(files) * len(linters)
+@pytest.mark.parametrize('max_paths,expected_jobs', [(1, 12), (4, 6), (16, 6)])
+def test_max_paths_per_job(monkeypatch, lint, linters, files, max_paths, expected_jobs):
+    monkeypatch.setattr(roller, '_run_linters', fake_run_linters)
+    files = files[:4]
+    assert len(files) == 4
+    linters = linters[:3]
+    assert len(linters) == 3
+    lint.MAX_PATHS_PER_JOB = max_paths
+    lint.read(linters)
+    num_jobs = len(lint.roll(files, num_procs=2)['count'])
+    assert num_jobs == expected_jobs
 if __name__ == '__main__':