Bug 1577905 - Support warmload tests under --browsertime r=nalexander,rwood
author Tarek Ziadé <tarek@mozilla.com>
Mon, 30 Sep 2019 07:27:22 +0000
changeset 495691 9eb8f5738e312905bb78ab1d44d8d6506014d0ea
parent 495690 64b180a4f9f451500f0be9fca044edbf3d3be805
child 495692 5f77e91dd95e12a0118390fe8b6f5d48f4b505c5
push id 114140
push user dvarga@mozilla.com
push date Wed, 02 Oct 2019 18:04:51 +0000
treeherder mozilla-inbound@32eb0ea893f3
reviewers nalexander, rwood
bugs 1577905
milestone 71.0a1
Bug 1577905 - Support warmload tests under --browsertime r=nalexander,rwood

Differential Revision: https://phabricator.services.mozilla.com/D46443
testing/raptor/browsertime/browsertime_pageload.js
testing/raptor/raptor/output.py
testing/raptor/raptor/raptor.py
testing/raptor/raptor/results.py
new file mode 100644
--- /dev/null
+++ b/testing/raptor/browsertime/browsertime_pageload.js
@@ -0,0 +1,16 @@
+/* eslint-env node */
+
+module.exports = async function(context, commands) {
+  let url = context.options.browsertime.url;
+  let page_cycles = context.options.browsertime.page_cycles;
+  let page_cycle_delay = context.options.browsertime.page_cycle_delay;
+
+  await commands.wait.byTime(context.options.browsertime.foreground_delay);
+  await commands.navigate("about:blank");
+
+  for (let count = 0; count < page_cycles; count++) {
+    await commands.wait.byTime(page_cycle_delay);
+    await commands.measure.start(url);
+  }
+  return true;
+};
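
The values the script reads from context.options.browsertime.* are supplied on the browsertime command line as --browsertime.<name> options. As a rough sketch (the authoritative list is assembled in Browsertime.run_test() in raptor.py below; the URL and cycle count here are hypothetical), the extra options look like this:

    # Sketch of the extra options that feed browsertime_pageload.js; values are
    # passed as strings and surface in the script as context.options.browsertime.<name>.
    # The URL and page_cycles value below are hypothetical.
    extra_options = [
        "--browsertime.url", "https://example.com/",
        "--browsertime.page_cycles", "25",
        "--browsertime.page_cycle_delay", "1000",    # ms to wait before each cycle
        "--browsertime.foreground_delay", "5000",    # ms to wait before the first navigation
    ]
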
--- a/testing/raptor/raptor/output.py
+++ b/testing/raptor/raptor/output.py
@@ -1208,92 +1208,97 @@ class BrowsertimeOutput(PerftestOutput):
         ingestion.
 
         Note: For the overall subtest values/results (i.e. for each measurement type) we will use
         the Browsertime-provided statistics, instead of calculating our own geomeans from the
         replicates.
         """
         LOG.info("preparing browsertime results for output")
 
-        suites = []
-        test_results = {
-            'framework': {
-                'name': 'browsertime',
-            },
-            'suites': suites,
-        }
-
         # check if we actually have any results
         if len(self.results) == 0:
             LOG.error("no browsertime test results found for %s" %
                       ', '.join(test_names))
             return
 
+        test_results = {
+            'framework': {
+                'name': 'browsertime',
+            }
+        }
+
+        # use a mapping so we keep a single set of results per suite name
+        suites = {}
+
         for test in self.results:
-            vals = []
-            subtests = []
-            suite = {
-                'name': test['name'],
-                'type': test['type'],
-                'extraOptions': test['extra_options'],
-                'subtests': subtests,
-                'lowerIsBetter': test['lower_is_better'],
-                'unit': test['unit'],
-                'alertThreshold': float(test['alert_threshold'])
-            }
-
-            # Check if the test has set optional properties
-            if hasattr(test, "alert_change_type"):
-                suite['alertChangeType'] = test['alert_change_type']
-
             # process results for pageloader type of tests
             if test["type"] != "browsertime-pageload":
                 LOG.error("output.summarize received unsupported test results type for %s" %
                           test['name'])
                 continue
 
-            suites.append(suite)
+            if test['name'] not in suites:
+                suite = {
+                    'name': test['name'],
+                    'type': test['type'],
+                    'extraOptions': test['extra_options'],
+                    'lowerIsBetter': test['lower_is_better'],
+                    'unit': test['unit'],
+                    'alertThreshold': float(test['alert_threshold']),
+                    # like suites, subtests are keyed by name
+                    'subtests': {}
+                }
+
+                # Check if the test has set optional properties
+                if 'alert_change_type' in test:
+                    suite['alertChangeType'] = test['alert_change_type']
+
+                suites[test['name']] = suite
+            else:
+                suite = suites[test['name']]
 
             for measurement_name, replicates in test['measurements'].iteritems():
-                new_subtest = {}
-                new_subtest['name'] = measurement_name
-                new_subtest['replicates'] = replicates
-                new_subtest['lowerIsBetter'] = test['subtest_lower_is_better']
-                new_subtest['alertThreshold'] = float(test['alert_threshold'])
-                new_subtest['value'] = 0
-                new_subtest['unit'] = test['subtest_unit']
+                if measurement_name not in suite['subtests']:
+                    subtest = {}
+                    subtest['name'] = measurement_name
+                    subtest['lowerIsBetter'] = test['subtest_lower_is_better']
+                    subtest['alertThreshold'] = float(test['alert_threshold'])
+                    subtest['unit'] = test['subtest_unit']
 
-                # if 'alert_on' is set for this particular measurement, then we want to set the
-                # flag in the perfherder output to turn on alerting for this subtest
-                if self.subtest_alert_on is not None:
-                    if measurement_name in self.subtest_alert_on:
-                        LOG.info("turning on subtest alerting for measurement type: %s"
-                                 % measurement_name)
-                        new_subtest['shouldAlert'] = True
+                    # if 'alert_on' is set for this particular measurement, then we want to set the
+                    # flag in the perfherder output to turn on alerting for this subtest
+                    if self.subtest_alert_on is not None:
+                        if measurement_name in self.subtest_alert_on:
+                            LOG.info("turning on subtest alerting for measurement type: %s"
+                                     % measurement_name)
+                            subtest['shouldAlert'] = True
+                    subtest['replicates'] = []
+                    suite['subtests'][measurement_name] = subtest
+                else:
+                    subtest = suite['subtests'][measurement_name]
 
-                # for the subtest (page-load measurement type) overall score/result/value, we
-                # want to use the median of the replicates - now instead of calculating this
-                # ourselves, we will take this value from the browsertime results themselves
-                # as browsertime calculates the mean (and other values) automatically for us
-                bt_measurement_median = test['statistics'][measurement_name]['median']
-                new_subtest['value'] = bt_measurement_median
+                subtest['replicates'].extend(replicates)
 
-                # we have a vals list that contains all the top level results for each of the
-                # measurement types; this will be used to calculate an overall test result
-                # which will be the geomean of all of the top level results of each type
-                vals.append([new_subtest['value'], new_subtest['name']])
-                subtests.append(new_subtest)
+        # converting suites and subtests into lists, and sorting them
+        def _process(subtest):
+            subtest['value'] = filters.median(filters.ignore_first(subtest['replicates'], 1))
+            return subtest
+
+        def _process_suite(suite):
+            suite['subtests'] = [_process(subtest) for subtest in suite['subtests'].values()]
+            suite['subtests'].sort(key=lambda subtest: subtest['name'])
 
             # for pageload tests, if there are > 1 subtests here, that means there
             # were multiple measurement types captured in each single pageload; we want
             # to get the mean of those values and report 1 overall 'suite' value
             # for the page; all replicates will still be available in the JSON artifact
-
             # summarize results to get top overall suite result
-            if len(subtests) > 1:
+            if len(suite['subtests']) > 1:
+                vals = [[subtest['value'], subtest['name']] for subtest in suite['subtests']]
                 suite['value'] = self.construct_summary(vals,
                                                         testname=test['name'])
+            return suite
 
-            subtests.sort(key=lambda subtest: subtest['name'])
-
+        suites = [_process_suite(s) for s in suites.values()]
         suites.sort(key=lambda suite: suite['name'])
 
+        test_results['suites'] = suites
         self.summarized_results = test_results
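
The summarization above now keys suites and subtests by name, pools replicates from every parsed browsertime result, and takes each subtest value as the median of its replicates with the first (warm-up) page cycle dropped. A minimal sketch of that last step, using stand-in implementations of Raptor's filters.ignore_first and filters.median (assumptions, not the real helpers) and hypothetical replicate values:

    # Stand-ins for Raptor's filters.ignore_first / filters.median
    # (assumption: the real helpers behave equivalently for this example).
    def ignore_first(values, n=1):
        # drop the first n replicates (the warm-up page cycle)
        return values[n:]

    def median(values):
        values = sorted(values)
        mid = len(values) // 2
        if len(values) % 2:
            return values[mid]
        return (values[mid - 1] + values[mid]) / 2.0

    # hypothetical replicates for one subtest (e.g. 'fcp'), pooled across page cycles
    replicates = [480, 310, 305, 298, 301]
    value = median(ignore_first(replicates, 1))
    print(value)  # 303.0 -- the slower warm-up load does not skew the subtest value
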
--- a/testing/raptor/raptor/raptor.py
+++ b/testing/raptor/raptor/raptor.py
@@ -395,38 +395,60 @@ class Browsertime(Perftest):
         LOG.info('binary_path: {}'.format(binary_path))
 
         return ['--browser', 'firefox', '--firefox.binaryPath', binary_path]
 
     def run_test(self, test, timeout):
 
         self.run_test_setup(test)
 
-        cmd = ([self.browsertime_node, self.browsertime_browsertimejs] +
-               self.driver_paths +
-               self.browsertime_args +
-               ['--skipHar',
-                '--video', 'true',
-                '--visualMetrics', 'false',
-                '-vv',
-                '--resultDir', self.results_handler.result_dir_for_test(test),
-                '-n', str(test.get('browser_cycles', 1)), test['test_url']])
+        browsertime_script = [os.path.join(os.path.dirname(__file__), "..",
+                              "browsertime", "browsertime_pageload.js")]
+
+        browsertime_script.extend(self.browsertime_args)
 
         # timeout is a single page-load timeout value in ms from the test INI
         # convert timeout to seconds and account for browser cycles
         timeout = int(timeout / 1000) * int(test.get('browser_cycles', 1))
 
         # add some time for browser startup, time for the browsertime measurement code
         # to be injected/invoked, and for exceptions to bubble up; be generous
         timeout += (20 * int(test.get('browser_cycles', 1)))
 
         # if geckoProfile enabled, give browser more time for profiling
         if self.config['gecko_profile'] is True:
             timeout += 5 * 60
 
+        # browsertime deals with page cycles internally, so we
+        # need to give it a timeout value that includes all cycles
+        timeout = timeout * int(test.get("page_cycles", 1))
+
+        # pass a few extra options to the browsertime script
+        # XXX maybe these should be in the browsertime_args() func
+        browsertime_script.extend(["--browsertime.page_cycles",
+                                  str(test.get("page_cycles", 1))])
+        browsertime_script.extend(["--browsertime.url", test["test_url"]])
+
+        # Raptor's `pageCycleDelay`: delay (ms) between page-load cycles
+        browsertime_script.extend(["--browsertime.page_cycle_delay", "1000"])
+        # Raptor's `foregroundDelay`: delay (ms) before foregrounding the app
+        browsertime_script.extend(["--browsertime.foreground_delay", "5000"])
+
+        # the browsertime script cannot restart the browser itself,
+        # so we have to keep the -n option here.
+        cmd = ([self.browsertime_node, self.browsertime_browsertimejs] +
+               self.driver_paths +
+               browsertime_script +
+               ['--skipHar',
+                '--video', 'false',
+                '--visualMetrics', 'false',
+                '-vv',
+                '--resultDir', self.results_handler.result_dir_for_test(test),
+                '-n', str(test.get('browser_cycles', 1))])
+
         LOG.info('timeout (s): {}'.format(timeout))
         LOG.info('browsertime cwd: {}'.format(os.getcwd()))
         LOG.info('browsertime cmd: {}'.format(cmd))
         LOG.info('browsertime_ffmpeg: {}'.format(self.browsertime_ffmpeg))
 
         # browsertime requires ffmpeg on the PATH for `--video=true`.
         # It's easier to configure the PATH here than at the TC level.
         env = dict(os.environ)
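
The timeout handed to the browsertime process is derived from the per-pageload timeout in the test INI, the number of browser cycles, and (new here) the number of page cycles. A worked example of the arithmetic above, with hypothetical numbers (a 2-minute page-load timeout, 1 browser cycle, 25 page cycles, no Gecko profiling):

    timeout_ms = 120000                                # per-pageload timeout from the test INI (hypothetical)
    browser_cycles = 1
    page_cycles = 25
    gecko_profile = False

    timeout = int(timeout_ms / 1000) * browser_cycles  # 120 s
    timeout += 20 * browser_cycles                     # startup/injection slack -> 140 s
    if gecko_profile:
        timeout += 5 * 60                              # extra time when profiling
    timeout *= page_cycles                             # browsertime runs all page cycles itself -> 3500 s
    print(timeout)  # 3500
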
@@ -1310,17 +1332,16 @@ class RaptorAndroid(Raptor):
             # start measuring CPU usage
             self.cpu_profiler = start_android_cpu_profiler(self)
 
         self.wait_for_test_finish(test, timeout)
 
         # in debug mode, and running locally, leave the browser running
         if self.debug_mode and self.config['run_local']:
             LOG.info("* debug-mode enabled - please shutdown the browser manually...")
-            self.runner.wait(timeout=None)
 
     def check_for_crashes(self):
         super(RaptorAndroid, self).check_for_crashes()
 
         if not self.app_launched:
             LOG.info("skipping check_for_crashes: application has not been launched")
             return
         self.app_launched = False
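
The results.py changes below parse one entry per measured test URL from the browsertime result JSON, and only a handful of fields are read. A simplified, hypothetical entry (shape follows the parsing code in parse_browsertime_json(); all values are made up):

    # Simplified, hypothetical shape of one browsertime result entry;
    # only the fields read by parse_browsertime_json() below are shown.
    raw_result = {
        "info": {
            "url": "https://example.com/",           # hypothetical test URL
            "browsertime": {"version": "5.0.0"},     # hypothetical version string
        },
        # one entry per page cycle
        "browserScripts": [
            {"browser": {"userAgent": "Mozilla/5.0 ..."},
             "timings": {"firstPaint": 310,
                         "timeToContentfulPaint": 320,
                         "timeToDomContentFlushed": 300,
                         "loadEventEnd": 900}},
            # ...more page cycles...
        ],
        # aggregate statistics computed by browsertime over the page cycles
        "statistics": {
            "timings": {"firstPaint": {"median": 305.0, "mean": 307.2},
                        # ...one block per timing...
                        },
        },
    }
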
--- a/testing/raptor/raptor/results.py
+++ b/testing/raptor/raptor/results.py
@@ -296,50 +296,48 @@ class BrowsertimeResultsHandler(Perftest
                 },
               }
             }
           }
         ]
         """
         LOG.info("parsing results from browsertime json")
 
-        # For now, assume that browsertime loads only one site.
-        if len(raw_btresults) != 1:
-            raise ValueError("Browsertime did not measure exactly one site.")
-        (_raw_bt_results,) = raw_btresults
-
-        if not _raw_bt_results['browserScripts']:
-            raise ValueError("Browsertime produced no measurements.")
-        bt_browser = _raw_bt_results['browserScripts'][0]['browser']
-
-        bt_ver = _raw_bt_results['info']['browsertime']['version']
-        bt_url = _raw_bt_results['info']['url'],
-        bt_result = {'bt_ver': bt_ver,
-                     'browser': bt_browser,
-                     'url': bt_url,
-                     'measurements': {},
-                     'statistics': {}}
-
         # bt to raptor names
         conversion = (('fnbpaint', 'firstPaint'),
                       ('fcp', 'timeToContentfulPaint'),
                       ('dcf', 'timeToDomContentFlushed'),
                       ('loadtime', 'loadEventEnd'))
 
-        # extracting values from browserScripts and statistics
-        for bt, raptor in conversion:
-            # XXX looping several times in the list, could do better
-            bt_result['measurements'][bt] = [cycle['timings'][raptor] for cycle in
-                                             _raw_bt_results['browserScripts']]
+        results = []
+
+        for raw_result in raw_btresults:
+            if not raw_result['browserScripts']:
+                raise ValueError("Browsertime produced no measurements.")
 
-            # let's add the browsertime statistics; we'll use those for overall values instead
-            # of calculating our own based on the replicates
-            bt_result['statistics'][bt] = _raw_bt_results['statistics']['timings'][raptor]
+            bt_browser = raw_result['browserScripts'][0]['browser']
+            bt_ver = raw_result['info']['browsertime']['version']
+            bt_url = raw_result['info']['url']
+            bt_result = {'bt_ver': bt_ver,
+                         'browser': bt_browser,
+                         'url': bt_url,
+                         'measurements': {},
+                         'statistics': {}}
+            # extracting values from browserScripts and statistics
+            for bt, raptor in conversion:
+                # XXX this loops over the list several times; could be done better
+                bt_result['measurements'][bt] = [cycle['timings'][raptor] for cycle in
+                                                 raw_result['browserScripts']]
+                # let's add the browsertime statistics; we'll use those for overall values instead
+                # of calculating our own based on the replicates
+                bt_result['statistics'][bt] = raw_result['statistics']['timings'][raptor]
 
-        return bt_result
+            results.append(bt_result)
+
+        return results
 
     def summarize_and_output(self, test_config, tests, test_names):
         """
         Retrieve, process, and output the browsertime test results. Currently supports page-load
         type tests only.
 
         The Raptor framework either ran a single page-load test (one URL) - or - an entire suite
         of page-load tests (multiple test URLs). Regardless, every test URL measured will
@@ -374,37 +372,36 @@ class BrowsertimeResultsHandler(Perftest
                 with open(bt_res_json, 'r') as f:
                     raw_btresults = json.load(f)
             except Exception as e:
                 LOG.error("Exception reading %s" % bt_res_json)
                 # XXX this should be replaced by a traceback call
                 LOG.error("Exception: %s %s" % (type(e).__name__, str(e)))
                 raise
 
-            new_result = self.parse_browsertime_json(raw_btresults)
+            for new_result in self.parse_browsertime_json(raw_btresults):
+                # add additional info not from the browsertime json
+                for field in ('name', 'unit', 'lower_is_better',
+                              'alert_threshold', 'cold'):
+                    new_result[field] = test[field]
 
-            # add additional info not from the browsertime json
-            for field in ('name', 'unit', 'lower_is_better',
-                          'alert_threshold', 'cold'):
-                new_result[field] = test[field]
-
-            # Differentiate Raptor `pageload` tests from `browsertime-pageload`
-            # tests while we compare and contrast.
-            new_result['type'] = "browsertime-pageload"
+                # Differentiate Raptor `pageload` tests from `browsertime-pageload`
+                # tests while we compare and contrast.
+                new_result['type'] = "browsertime-pageload"
 
-            # All Browsertime measurements are elapsed times in milliseconds.
-            new_result['subtest_lower_is_better'] = True
-            new_result['subtest_unit'] = 'ms'
-            LOG.info("parsed new result: %s" % str(new_result))
+                # All Browsertime measurements are elapsed times in milliseconds.
+                new_result['subtest_lower_is_better'] = True
+                new_result['subtest_unit'] = 'ms'
+                LOG.info("parsed new result: %s" % str(new_result))
 
-            # `extra_options` will be populated with Gecko profiling flags in
-            # the future.
-            new_result['extra_options'] = []
+                # `extra_options` will be populated with Gecko profiling flags in
+                # the future.
+                new_result['extra_options'] = []
 
-            self.results.append(new_result)
+                self.results.append(new_result)
 
         # now have all results gathered from all browsertime test URLs; format them for output
         output = BrowsertimeOutput(self.results,
                                    self.supporting_data,
                                    test_config['subtest_alert_on'])
 
         output.summarize(test_names)
         res, out_perfdata = output.output(test_names)
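
Putting the pieces together: after BrowsertimeOutput.summarize() runs, self.summarized_results has roughly the following shape (a simplified, hypothetical example; keys follow the output.py code above, all numbers are made up):

    summarized_results = {
        "framework": {"name": "browsertime"},
        "suites": [
            {
                "name": "raptor-tp6-example-firefox",   # hypothetical test name
                "type": "browsertime-pageload",
                "extraOptions": [],
                "lowerIsBetter": True,
                "unit": "ms",
                "alertThreshold": 2.0,
                # overall suite value from construct_summary(), present when
                # there is more than one subtest
                "value": 350.2,
                "subtests": [
                    {"name": "dcf", "unit": "ms", "lowerIsBetter": True,
                     "alertThreshold": 2.0, "value": 303.0,
                     "replicates": [480, 310, 305, 298, 301]},
                    # ...one entry each for fcp, fnbpaint and loadtime...
                ],
            },
        ],
    }
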