Bug 1339594 - Improve Talos error handling for hung browser; r=jmaher
authorGeoff Brown <gbrown@mozilla.com>
Fri, 24 Feb 2017 09:53:09 -0700
changeset 373821 eb573b6f4d66f9f1161c0058e61113feb4df8bb2
parent 373820 237552f782e7c1d474b46b9aa2b540326860e41d
child 373822 9b7942f48f01bb128a29f2b7f6a0cbc0bd5ac49e
push id10863
push userjlorenzo@mozilla.com
push dateMon, 06 Mar 2017 23:02:23 +0000
treeherdermozilla-aurora@0931190cd725 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjmaher
bugs1339594
milestone54.0a1
Bug 1339594 - Improve Talos error handling for hung browser; r=jmaher Previously, some talos hangs were resulting in psutil.NoSuchProcess errors, because the hung browser was killed twice. With this change, exceptions encountered on killing the process are handled, so that hangs result in minidump generation and crash reporting. The NoSuchProcess failures will be replaced with PROCESS-CRASH failures.
testing/talos/talos/talos_process.py
testing/talos/talos/ttest.py
--- a/testing/talos/talos/talos_process.py
+++ b/testing/talos/talos/talos_process.py
@@ -1,15 +1,16 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import time
 import psutil
 import mozcrash
+import traceback
 from mozprocess import ProcessHandler
 from threading import Event
 
 from mozlog import get_proxy_logger
 
 from utils import TalosError
 
 LOG = get_proxy_logger()
@@ -115,16 +116,17 @@ def run_browser(command, minidump_dir, t
     LOG.process_start(proc.pid, ' '.join(command))
     try:
         context.process = psutil.Process(proc.pid)
         if on_started:
             on_started(context.process)
         # wait until we saw __endTimestamp in the proc output,
         # or the browser just terminated - or we have a timeout
         if not event.wait(timeout):
+            LOG.info("Timeout waiting for test completion; killing browser...")
             # try to extract the minidump stack if the browser hangs
             mozcrash.kill_and_get_minidump(proc.pid, minidump_dir)
             raise TalosError("timeout")
         if reader.got_end_timestamp:
             for i in range(1, wait_for_quit_timeout):
                 if proc.wait(1) is not None:
                     break
             if proc.poll() is None:
@@ -132,19 +134,24 @@ def run_browser(command, minidump_dir, t
                     "Browser shutdown timed out after {0} seconds, terminating"
                     " process.".format(wait_for_quit_timeout)
                 )
         elif reader.got_timeout:
             raise TalosError('TIMEOUT: %s' % reader.timeout_message)
     finally:
         # this also handle KeyboardInterrupt
         # ensure early the process is really terminated
-        return_code = context.kill_process()
-        if return_code is None:
-            return_code = proc.wait(1)
+        try:
+            return_code = context.kill_process()
+            if return_code is None:
+                return_code = proc.wait(1)
+        except:
+            # Maybe killed by kill_and_get_minidump(), maybe ended?
+            LOG.info("Unable to kill process")
+            LOG.info(traceback.format_exc())
 
     reader.output.append(
         "__startBeforeLaunchTimestamp%d__endBeforeLaunchTimestamp"
         % first_time)
     reader.output.append(
         "__startAfterTerminationTimestamp%d__endAfterTerminationTimestamp"
         % (int(time.time()) * 1000))
 
--- a/testing/talos/talos/ttest.py
+++ b/testing/talos/talos/ttest.py
@@ -168,16 +168,20 @@ class TTest(object):
                     command_args,
                     minidump_dir,
                     timeout=timeout,
                     env=setup.env,
                     # start collecting counters as soon as possible
                     on_started=(counter_management.start
                                 if counter_management else None),
                 )
+            except:
+                self.check_for_crashes(browser_config, minidump_dir,
+                                       test_config['name'])
+                raise
             finally:
                 if counter_management:
                     counter_management.stop()
 
             if test_config['mainthread']:
                 rawlog = os.path.join(here, "mainthread_io.log")
                 if os.path.exists(rawlog):
                     processedlog = \