Bug 1433873 - Fix race condition in Marionette client for in_app quit and restart. r=ato
authorHenrik Skupin <mail@hskupin.info>
Fri, 05 Oct 2018 16:43:18 +0000
changeset 495567 4204f9fab5fea59e25e6413726b65976540b3432
parent 495566 af465aa90d730fa51fed9e76ba46bffe04ac74a7
child 495568 65403cf5da8af663513ccd84dfe16f3a3d1b5ddb
push id9984
push userffxbld-merge
push dateMon, 15 Oct 2018 21:07:35 +0000
treeherdermozilla-beta@183d27ea8570 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersato
bugs1433873
milestone64.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1433873 - Fix race condition in Marionette client for in_app quit and restart. r=ato Both methods are poorly handling the phase when the application is about to quit/restart. Especially when a callback is used to trigger the application shutdown. This patch makes sure that in case of an active shutdown the do_process_check decorator doesn't trigger a socket failure, and also correctly waits for the application to shutdown, or being restarted. Differential Revision: https://phabricator.services.mozilla.com/D7897
testing/marionette/client/marionette_driver/decorators.py
testing/marionette/client/marionette_driver/marionette.py
testing/marionette/harness/marionette_harness/tests/unit/test_quit_restart.py
--- a/testing/marionette/client/marionette_driver/decorators.py
+++ b/testing/marionette/client/marionette_driver/decorators.py
@@ -20,19 +20,24 @@ def _find_marionette_in_args(*args, **kw
 
 def do_process_check(func):
     """Decorator which checks the process status after the function has run."""
     @wraps(func)
     def _(*args, **kwargs):
         try:
             return func(*args, **kwargs)
         except (socket.error, socket.timeout):
+            m = _find_marionette_in_args(*args, **kwargs)
+
             # In case of socket failures which will also include crashes of the
-            # application, make sure to handle those correctly.
-            m = _find_marionette_in_args(*args, **kwargs)
+            # application, make sure to handle those correctly. In case of an
+            # active shutdown just let it bubble up.
+            if m.is_shutting_down:
+                raise
+
             m._handle_socket_failure()
 
     return _
 
 
 def uses_marionette(func):
     """Decorator which creates a marionette session and deletes it
     afterwards if one doesn't already exist.
--- a/testing/marionette/client/marionette_driver/marionette.py
+++ b/testing/marionette/client/marionette_driver/marionette.py
@@ -681,21 +681,23 @@ class Marionette(object):
         try:
             s.bind((host, port))
             return True
         except socket.error:
             return False
         finally:
             s.close()
 
-    def raise_for_port(self, timeout=None):
+    def raise_for_port(self, timeout=None, check_process_status=True):
         """Raise socket.timeout if no connection can be established.
 
-        :param timeout: Timeout in seconds for the server to be ready.
-
+        :param timeout: Optional timeout in seconds for the server to be ready.
+        :param check_process_status: Optional, if `True` the process will be
+            continuously checked if it has exited, and the connection
+            attempt will be aborted.
         """
         if timeout is None:
             timeout = self.DEFAULT_STARTUP_TIMEOUT
 
         runner = None
         if self.instance is not None:
             runner = self.instance.runner
 
@@ -703,17 +705,17 @@ class Marionette(object):
         starttime = datetime.datetime.now()
         timeout_time = starttime + datetime.timedelta(seconds=timeout)
 
         client = transport.TcpTransport(self.host, self.port, 0.5)
 
         connected = False
         while datetime.datetime.now() < timeout_time:
             # If the instance we want to connect to is not running return immediately
-            if runner is not None and not runner.is_running():
+            if check_process_status and runner is not None and not runner.is_running():
                 break
 
             try:
                 client.connect()
                 return True
             except socket.error:
                 pass
             finally:
@@ -1089,40 +1091,49 @@ class Marionette(object):
                          be used to trigger the shutdown.
         """
         if not self.instance:
             raise errors.MarionetteException("quit() can only be called "
                                              "on Gecko instances launched by Marionette")
 
         cause = None
         if in_app:
-            if callback is not None:
-                if not callable(callback):
-                    raise ValueError("Specified callback '{}' is not callable".format(callback))
+            if callback is not None and not callable(callback):
+                raise ValueError("Specified callback '{}' is not callable".format(callback))
+
+            # Block Marionette from accepting new connections
+            self._send_message("Marionette:AcceptConnections",
+                               {"value": False})
+
+            try:
+                self.is_shutting_down = True
+                if callback is not None:
+                    callback()
+                else:
+                    cause = self._request_in_app_shutdown()
 
-                self._send_message("Marionette:AcceptConnections",
-                                   {"value": False})
-                callback()
-            else:
-                cause = self._request_in_app_shutdown()
+            except IOError:
+                # A possible IOError should be ignored at this point, given that
+                # quit() could have been called inside of `using_context`,
+                # which wants to reset the context but fails sending the message.
+                pass
 
+            returncode = self.instance.runner.wait(timeout=self.DEFAULT_SHUTDOWN_TIMEOUT)
+            if returncode is None:
+                # The process did not shutdown itself, so force-closing it.
+                self.cleanup()
+
+                message = "Process still running {}s after quit request"
+                raise IOError(message.format(self.DEFAULT_SHUTDOWN_TIMEOUT))
+
+            self.is_shutting_down = False
             self.delete_session(send_request=False)
 
-            # Give the application some time to shutdown
-            returncode = self.instance.runner.wait(timeout=self.DEFAULT_SHUTDOWN_TIMEOUT)
-            if returncode is None:
-                # This will force-close the application without sending any other message.
-                self.cleanup()
-
-                message = ("Process killed because a requested application quit did not happen "
-                           "within {}s. Check gecko.log for errors.")
-                raise IOError(message.format(self.DEFAULT_SHUTDOWN_TIMEOUT))
-
         else:
-            self.delete_session()
+            self.delete_session(send_request=False)
             self.instance.close(clean=clean)
 
         if cause not in (None, "shutdown"):
             raise errors.MarionetteException("Unexpected shutdown reason '{}' for "
                                              "quitting the process.".format(cause))
 
     @do_process_check
     def restart(self, clean=False, in_app=False, callback=None):
@@ -1145,37 +1156,64 @@ class Marionette(object):
         context = self._send_message("Marionette:GetContext",
                                      key="value")
 
         cause = None
         if in_app:
             if clean:
                 raise ValueError("An in_app restart cannot be triggered with the clean flag set")
 
-            if callback is not None:
-                if not callable(callback):
-                    raise ValueError("Specified callback '{}' is not callable".format(callback))
+            if callback is not None and not callable(callback):
+                raise ValueError("Specified callback '{}' is not callable".format(callback))
+
+            # Block Marionette from accepting new connections
+            self._send_message("Marionette:AcceptConnections",
+                               {"value": False})
+
+            try:
+                self.is_shutting_down = True
+                if callback is not None:
+                    callback()
+                else:
+                    cause = self._request_in_app_shutdown("eRestart")
+
+            except IOError:
+                # A possible IOError should be ignored at this point, given that
+                # restart() could have been called inside of `using_context`,
+                # which wants to reset the context but fails sending the message.
+                pass
 
-                self._send_message("Marionette:AcceptConnections",
-                                   {"value": False})
-                callback()
-            else:
-                cause = self._request_in_app_shutdown("eRestart")
+            try:
+                # Wait for a new Marionette connection to appear while the
+                # process restarts itself.
+                self.raise_for_port(timeout=self.DEFAULT_SHUTDOWN_TIMEOUT,
+                                    check_process_status=False)
+            except socket.timeout:
+                exc, val, tb = sys.exc_info()
+
+                if self.instance.runner.returncode is None:
+                    # The process is still running, which means the shutdown
+                    # request was not correct or the application ignored it.
+                    # Allow Marionette to accept connections again.
+                    self._send_message("acceptConnections", {"value": True})
+
+                    message = "Process still running {}s after restart request"
+                    reraise(exc, message.format(self.DEFAULT_SHUTDOWN_TIMEOUT), tb)
+
+                else:
+                    # The process shutdown but didn't start again.
+                    self.cleanup()
+                    msg = "Process unexpectedly quit without restarting (exit code: {})"
+                    reraise(exc, msg.format(self.instance.runner.returncode), tb)
+
+            finally:
+                self.is_shutting_down = False
 
             self.delete_session(send_request=False)
 
-            try:
-                timeout = self.DEFAULT_SHUTDOWN_TIMEOUT + self.DEFAULT_STARTUP_TIMEOUT
-                self.raise_for_port(timeout=timeout)
-            except socket.timeout:
-                if self.instance.runner.returncode is not None:
-                    exc, val, tb = sys.exc_info()
-                    self.cleanup()
-                    reraise(exc, "Requested restart of the application was aborted", tb)
-
         else:
             self.delete_session()
             self.instance.restart(clean=clean)
             self.raise_for_port(timeout=self.DEFAULT_STARTUP_TIMEOUT)
 
         if cause not in (None, "restart"):
             raise errors.MarionetteException("Unexpected shutdown reason '{}' for "
                                              "restarting the process".format(cause))
--- a/testing/marionette/harness/marionette_harness/tests/unit/test_quit_restart.py
+++ b/testing/marionette/harness/marionette_harness/tests/unit/test_quit_restart.py
@@ -1,18 +1,17 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 from __future__ import absolute_import, print_function
 
-import sys
 import urllib
 
-from unittest import skip, skipIf
+from unittest import skip
 
 from marionette_driver import errors
 from marionette_driver.by import By
 from marionette_harness import MarionetteTestCase
 
 
 def inline(doc):
     return "data:text/html;charset=utf-8,{}".format(urllib.quote(doc))
@@ -113,67 +112,63 @@ class TestQuitRestart(MarionetteTestCase
             Components.utils.import("resource://gre/modules/Services.jsm");
             let flags = Ci.nsIAppStartup.eAttemptQuit;
             if (arguments[0]) {
               flags |= Ci.nsIAppStartup.eRestart;
             }
             Services.startup.quit(flags);
         """, script_args=(restart,))
 
+    def test_force_restart(self):
+        self.marionette.restart()
+        self.assertEqual(self.marionette.profile, self.profile)
+        self.assertNotEqual(self.marionette.session_id, self.session_id)
+
+        # A forced restart will cause a new process id
+        self.assertNotEqual(self.marionette.process_id, self.pid)
+        self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
+                            "about:about")
+
     def test_force_clean_restart(self):
         self.marionette.restart(clean=True)
         self.assertNotEqual(self.marionette.profile, self.profile)
         self.assertNotEqual(self.marionette.session_id, self.session_id)
         # A forced restart will cause a new process id
         self.assertNotEqual(self.marionette.process_id, self.pid)
         self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
                             "about:about")
 
-    def test_force_restart(self):
-        self.marionette.restart()
-        self.assertEqual(self.marionette.profile, self.profile)
-        self.assertNotEqual(self.marionette.session_id, self.session_id)
-        # A forced restart will cause a new process id
-        self.assertNotEqual(self.marionette.process_id, self.pid)
-        self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
-                            "about:about")
+    def test_force_quit(self):
+        self.marionette.quit()
+
+        self.assertEqual(self.marionette.session, None)
+        with self.assertRaisesRegexp(errors.InvalidSessionIdException, "Please start a session"):
+            self.marionette.get_url()
 
     def test_force_clean_quit(self):
         self.marionette.quit(clean=True)
 
         self.assertEqual(self.marionette.session, None)
         with self.assertRaisesRegexp(errors.InvalidSessionIdException, "Please start a session"):
             self.marionette.get_url()
 
         self.marionette.start_session()
         self.assertNotEqual(self.marionette.profile, self.profile)
         self.assertNotEqual(self.marionette.session_id, self.session_id)
         self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
                             "about:about")
 
-    def test_force_quit(self):
-        self.marionette.quit()
-
-        self.assertEqual(self.marionette.session, None)
-        with self.assertRaisesRegexp(errors.InvalidSessionIdException, "Please start a session"):
-            self.marionette.get_url()
-
-        self.marionette.start_session()
-        self.assertEqual(self.marionette.profile, self.profile)
-        self.assertNotEqual(self.marionette.session_id, self.session_id)
-        self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
-                            "about:about")
-
     def test_no_in_app_clean_restart(self):
         # Test that in_app and clean cannot be used in combination
-        with self.assertRaises(ValueError):
+        with self.assertRaisesRegexp(ValueError, "cannot be triggered with the clean flag set"):
             self.marionette.restart(in_app=True, clean=True)
 
     def test_in_app_restart(self):
         self.marionette.restart(in_app=True)
+
         self.assertEqual(self.marionette.profile, self.profile)
         self.assertNotEqual(self.marionette.session_id, self.session_id)
 
         # An in-app restart will keep the same process id only on Linux
         if self.marionette.session_capabilities["platformName"] == "linux":
             self.assertEqual(self.marionette.process_id, self.pid)
         else:
             self.assertNotEqual(self.marionette.process_id, self.pid)
@@ -192,16 +187,41 @@ class TestQuitRestart(MarionetteTestCase
         if self.marionette.session_capabilities["platformName"] == "linux":
             self.assertEqual(self.marionette.process_id, self.pid)
         else:
             self.assertNotEqual(self.marionette.process_id, self.pid)
 
         self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
                             "about:about")
 
+    def test_in_app_restart_with_callback_not_callable(self):
+        with self.assertRaisesRegexp(ValueError, "is not callable"):
+            self.marionette.restart(in_app=True, callback=4)
+
+    def test_in_app_restart_with_callback_but_process_quit(self):
+        timeout_shutdown = self.marionette.DEFAULT_SHUTDOWN_TIMEOUT
+        # Wait at least 70s for the hang monitor in case of a shutdown hang
+        self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = 70
+
+        try:
+            with self.assertRaisesRegexp(IOError, "Process unexpectedly quit without restarting"):
+                self.marionette.restart(in_app=True, callback=lambda: self.shutdown(restart=False))
+        finally:
+            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = timeout_shutdown
+
+    def test_in_app_restart_with_callback_missing_shutdown(self):
+        try:
+            timeout_shutdown = self.marionette.DEFAULT_SHUTDOWN_TIMEOUT
+            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = 5
+
+            with self.assertRaisesRegexp(IOError, "the connection to Marionette server is lost"):
+                self.marionette.restart(in_app=True, callback=lambda: False)
+        finally:
+            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = timeout_shutdown
+
     @skip("Bug 1397612 - Hang of Marionette client after the restart")
     def test_in_app_restart_safe_mode(self):
 
         def restart_in_safe_mode():
             with self.marionette.using_context("chrome"):
                 self.marionette.execute_script("""
                   Components.utils.import("resource://gre/modules/Services.jsm");
 
@@ -215,37 +235,20 @@ class TestQuitRestart(MarionetteTestCase
                   }
                 """)
 
         try:
             self.assertFalse(self.is_safe_mode, "Safe Mode is unexpectedly enabled")
             self.marionette.restart(in_app=True, callback=restart_in_safe_mode)
             self.assertTrue(self.is_safe_mode, "Safe Mode is not enabled")
         finally:
+            if self.marionette.session is None:
+                self.marionette.start_session()
             self.marionette.quit(clean=True)
 
-    def test_in_app_restart_with_callback_not_callable(self):
-        with self.assertRaisesRegexp(ValueError, "is not callable"):
-            self.marionette.restart(in_app=True, callback=4)
-
-    @skipIf(sys.platform.startswith("win"),
-            "Bug 1433873 - Fix race condition in Marionette for in_app quit and restart")
-    def test_in_app_restart_with_callback_missing_shutdown(self):
-        try:
-            timeout_startup = self.marionette.DEFAULT_STARTUP_TIMEOUT
-            timeout_shutdown = self.marionette.DEFAULT_SHUTDOWN_TIMEOUT
-            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = 5
-            self.marionette.DEFAULT_STARTUP_TIMEOUT = 5
-
-            with self.assertRaisesRegexp(IOError, "the connection to Marionette server is lost"):
-                self.marionette.restart(in_app=True, callback=lambda: False)
-        finally:
-            self.marionette.DEFAULT_STARTUP_TIMEOUT = timeout_startup
-            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = timeout_shutdown
-
     def test_in_app_quit(self):
         self.marionette.quit(in_app=True)
 
         self.assertEqual(self.marionette.session, None)
         with self.assertRaisesRegexp(errors.InvalidSessionIdException, "Please start a session"):
             self.marionette.get_url()
 
         self.marionette.start_session()
@@ -264,19 +267,19 @@ class TestQuitRestart(MarionetteTestCase
         self.assertEqual(self.marionette.profile, self.profile)
         self.assertNotEqual(self.marionette.session_id, self.session_id)
         self.assertNotEqual(self.marionette.get_pref("startup.homepage_welcome_url"),
                             "about:about")
 
     def test_in_app_quit_with_callback_missing_shutdown(self):
         try:
             timeout = self.marionette.DEFAULT_SHUTDOWN_TIMEOUT
-            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = 10
+            self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = 5
 
-            with self.assertRaisesRegexp(IOError, "a requested application quit did not happen"):
+            with self.assertRaisesRegexp(IOError, "Process still running"):
                 self.marionette.quit(in_app=True, callback=lambda: False)
         finally:
             self.marionette.DEFAULT_SHUTDOWN_TIMEOUT = timeout
 
     def test_in_app_quit_with_callback_not_callable(self):
         with self.assertRaisesRegexp(ValueError, "is not callable"):
             self.marionette.restart(in_app=True, callback=4)