Bug 1339594 - Improve Talos error handling for hung browser. r=jmaher, a=test-only
authorGeoff Brown <gbrown@mozilla.com>
Fri, 24 Feb 2017 09:53:09 -0700
changeset 376560 e695a8e2017a7e5d828f75c4e320b98ab3bb212a
parent 376559 84df58250aaa030ecde8573105f307735467da28
child 376561 975654efb213c1e5dc0838fdf0ac24084e7d23cb
push id6996
push userjlorenzo@mozilla.com
push dateMon, 06 Mar 2017 20:48:21 +0000
treeherdermozilla-beta@d89512dab048 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjmaher, test-only
bugs1339594
milestone53.0a2
Bug 1339594 - Improve Talos error handling for hung browser. r=jmaher, a=test-only Previously, some talos hangs were resulting in psutil.NoSuchProcess errors, because the hung browser was killed twice. With this change, exceptions encountered on killing the process are handled, so that hangs result in minidump generation and crash reporting. The NoSuchProcess failures will be replaced with PROCESS-CRASH failures.
testing/talos/talos/talos_process.py
testing/talos/talos/ttest.py
--- a/testing/talos/talos/talos_process.py
+++ b/testing/talos/talos/talos_process.py
@@ -1,15 +1,16 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import time
 import psutil
 import mozcrash
+import traceback
 from mozprocess import ProcessHandler
 from threading import Event
 
 from mozlog import get_proxy_logger
 
 from utils import TalosError
 
 LOG = get_proxy_logger()
@@ -115,16 +116,17 @@ def run_browser(command, minidump_dir, t
     LOG.process_start(proc.pid, ' '.join(command))
     try:
         context.process = psutil.Process(proc.pid)
         if on_started:
             on_started(context.process)
         # wait until we saw __endTimestamp in the proc output,
         # or the browser just terminated - or we have a timeout
         if not event.wait(timeout):
+            LOG.info("Timeout waiting for test completion; killing browser...")
             # try to extract the minidump stack if the browser hangs
             mozcrash.kill_and_get_minidump(proc.pid, minidump_dir)
             raise TalosError("timeout")
         if reader.got_end_timestamp:
             for i in range(1, wait_for_quit_timeout):
                 if proc.wait(1) is not None:
                     break
             if proc.poll() is None:
@@ -132,19 +134,24 @@ def run_browser(command, minidump_dir, t
                     "Browser shutdown timed out after {0} seconds, terminating"
                     " process.".format(wait_for_quit_timeout)
                 )
         elif reader.got_timeout:
             raise TalosError('TIMEOUT: %s' % reader.timeout_message)
     finally:
         # this also handle KeyboardInterrupt
         # ensure early the process is really terminated
-        return_code = context.kill_process()
-        if return_code is None:
-            return_code = proc.wait(1)
+        try:
+            return_code = context.kill_process()
+            if return_code is None:
+                return_code = proc.wait(1)
+        except:
+            # Maybe killed by kill_and_get_minidump(), maybe ended?
+            LOG.info("Unable to kill process")
+            LOG.info(traceback.format_exc())
 
     reader.output.append(
         "__startBeforeLaunchTimestamp%d__endBeforeLaunchTimestamp"
         % first_time)
     reader.output.append(
         "__startAfterTerminationTimestamp%d__endAfterTerminationTimestamp"
         % (int(time.time()) * 1000))
 
--- a/testing/talos/talos/ttest.py
+++ b/testing/talos/talos/ttest.py
@@ -168,16 +168,20 @@ class TTest(object):
                     command_args,
                     minidump_dir,
                     timeout=timeout,
                     env=setup.env,
                     # start collecting counters as soon as possible
                     on_started=(counter_management.start
                                 if counter_management else None),
                 )
+            except:
+                self.check_for_crashes(browser_config, minidump_dir,
+                                       test_config['name'])
+                raise
             finally:
                 if counter_management:
                     counter_management.stop()
 
             if test_config['mainthread']:
                 rawlog = os.path.join(here, "mainthread_io.log")
                 if os.path.exists(rawlog):
                     processedlog = \