Bug 1263815 - retry xvfb startup. r=dustin a=npotb
author: Gregory Arndt <garndt@mozilla.com>
Wed, 06 Jul 2016 09:06:58 -0500
changeset 347976 ee62b64c48474df97712e1988528e8ec1606814a
parent 347975 2984e7af1c0e17309abcc6f9e20bdf0e3af1f4f8
child 347977 c6d4012b25e6984a7d6d588bc9dc0d8198f112c2
push id: 6389
push user: raliiev@mozilla.com
push date: Mon, 19 Sep 2016 13:38:22 +0000
treeherder: mozilla-beta@01d67bfe6c81 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: dustin, npotb
bugs: 1263815
milestone: 50.0a2
Bug 1263815 - retry xvfb startup. r=dustin a=npotb Sometimes xvfb will not start up with the current retry/delay settings. This will attempt to retry more and delay for longer to ensure xvfb has started up. Common pieces of this have been factored out into a recipe that all docker images needing this functionality can share. MozReview-Commit-ID: BTXkJkBWLZX
taskcluster/scripts/builder/build-l10n.sh
taskcluster/scripts/builder/build-linux.sh
taskcluster/scripts/tester/test-ubuntu1204.sh
taskcluster/scripts/tester/test-ubuntu1604.sh
testing/docker/desktop-build/Dockerfile
testing/docker/desktop-test/Dockerfile
testing/docker/desktop1604-test/Dockerfile
testing/docker/recipes/xvfb.sh
--- a/taskcluster/scripts/builder/build-l10n.sh
+++ b/taskcluster/scripts/builder/build-l10n.sh
@@ -1,14 +1,16 @@
 #! /bin/bash -vex
 
 set -x -e
 
 echo "running as" $(id)
 
+. /home/worker/scripts/xvfb.sh
+
 ####
 # Taskcluster friendly wrapper for performing fx desktop l10n repacks via mozharness.
 # Based on ./build-linux.sh
 ####
 
 # Inputs, with defaults
 
 : MOZHARNESS_SCRIPT             ${MOZHARNESS_SCRIPT}
@@ -38,47 +40,24 @@ export TINDERBOX_OUTPUT=1
 export LIBRARY_PATH=$LIBRARY_PATH:$WORKSPACE/src/obj-firefox:$WORKSPACE/src/gcc/lib64
 
 # test required parameters are supplied
 if [[ -z ${MOZHARNESS_SCRIPT} ]]; then fail "MOZHARNESS_SCRIPT is not set"; fi
 if [[ -z ${MOZHARNESS_CONFIG} ]]; then fail "MOZHARNESS_CONFIG is not set"; fi
 
 cleanup() {
     local rv=$?
-    if [ -n "$xvfb_pid" ]; then
-        kill $xvfb_pid || true
-    fi
+    cleanup_xvfb
     exit $rv
 }
 trap cleanup EXIT INT
 
-# run mozharness in XVfb, if necessary; this is an array to maintain the quoting in the -s argument
+# run XVfb in the background, if necessary
 if $NEED_XVFB; then
-    # Some mozharness scripts set DISPLAY=:2
-    Xvfb :2 -screen 0 1024x768x24 &
-    export DISPLAY=:2
-    xvfb_pid=$!
-    # Only error code 255 matters, because it signifies that no
-    # display could be opened. As long as we can open the display
-    # tests should work. We'll retry a few times with a sleep before
-    # failing.
-    retry_count=0
-    max_retries=2
-    xvfb_test=0
-    until [ $retry_count -gt $max_retries ]; do
-        xvinfo || xvfb_test=$?
-        if [ $xvfb_test != 255 ]; then
-            retry_count=$(($max_retries + 1))
-        else
-            retry_count=$(($retry_count + 1))
-            echo "Failed to start Xvfb, retry: $retry_count"
-            sleep 2
-        fi
-    done
-    if [ $xvfb_test == 255 ]; then fail "xvfb did not start properly"; fi
+    start_xvfb '1024x768x24' 2
 fi
 
 # set up mozharness configuration, via command line, env, etc.
 
 # $TOOLTOOL_CACHE bypasses mozharness completely and is read by tooltool_wrapper.sh to set the
 # cache.  However, only some mozharness scripts use tooltool_wrapper.sh, so this may not be
 # entirely effective.
 export TOOLTOOL_CACHE
--- a/taskcluster/scripts/builder/build-linux.sh
+++ b/taskcluster/scripts/builder/build-linux.sh
@@ -1,14 +1,16 @@
 #! /bin/bash -vex
 
 set -x -e
 
 echo "running as" $(id)
 
+. /home/worker/scripts/xvfb.sh
+
 ####
 # Taskcluster friendly wrapper for performing fx desktop builds via mozharness.
 ####
 
 # Inputs, with defaults
 
 : MOZHARNESS_SCRIPT             ${MOZHARNESS_SCRIPT}
 : MOZHARNESS_CONFIG             ${MOZHARNESS_CONFIG}
@@ -49,47 +51,24 @@ export MOZ_AUTOMATION_UPLOAD_SYMBOLS=0
 export LIBRARY_PATH=$LIBRARY_PATH:$WORKSPACE/src/obj-firefox:$WORKSPACE/src/gcc/lib64
 
 # test required parameters are supplied
 if [[ -z ${MOZHARNESS_SCRIPT} ]]; then fail "MOZHARNESS_SCRIPT is not set"; fi
 if [[ -z ${MOZHARNESS_CONFIG} ]]; then fail "MOZHARNESS_CONFIG is not set"; fi
 
 cleanup() {
     local rv=$?
-    if [ -n "$xvfb_pid" ]; then
-        kill $xvfb_pid || true
-    fi
+    cleanup_xvfb
     exit $rv
 }
 trap cleanup EXIT INT
 
-# run mozharness in XVfb, if necessary; this is an array to maintain the quoting in the -s argument
+# run XVfb in the background, if necessary
 if $NEED_XVFB; then
-    # Some mozharness scripts set DISPLAY=:2
-    Xvfb :2 -screen 0 1024x768x24 &
-    export DISPLAY=:2
-    xvfb_pid=$!
-    # Only error code 255 matters, because it signifies that no
-    # display could be opened. As long as we can open the display
-    # tests should work. We'll retry a few times with a sleep before
-    # failing.
-    retry_count=0
-    max_retries=2
-    xvfb_test=0
-    until [ $retry_count -gt $max_retries ]; do
-        xvinfo || xvfb_test=$?
-        if [ $xvfb_test != 255 ]; then
-            retry_count=$(($max_retries + 1))
-        else
-            retry_count=$(($retry_count + 1))
-            echo "Failed to start Xvfb, retry: $retry_count"
-            sleep 2
-        fi
-    done
-    if [ $xvfb_test == 255 ]; then fail "xvfb did not start properly"; fi
+    start_xvfb '1024x768x24' 2
 fi
 
 # set up mozharness configuration, via command line, env, etc.
 
 debug_flag=""
 if [ 0$DEBUG -ne 0 ]; then
   debug_flag='--debug'
 fi
--- a/taskcluster/scripts/tester/test-ubuntu1204.sh
+++ b/taskcluster/scripts/tester/test-ubuntu1204.sh
@@ -1,14 +1,16 @@
 #! /bin/bash -xe
 
 set -x -e
 
 echo "running as" $(id)
 
+. /home/worker/scripts/xvfb.sh
+
 ####
 # Taskcluster friendly wrapper for performing fx desktop tests via mozharness.
 ####
 
 # Inputs, with defaults
 
 : MOZHARNESS_URL                ${MOZHARNESS_URL}
 : MOZHARNESS_SCRIPT             ${MOZHARNESS_SCRIPT}
@@ -38,23 +40,17 @@ if [[ -z ${MOZHARNESS_CONFIG} ]]; then f
 mkdir -p ~/artifacts/public
 
 cleanup() {
     local rv=$?
     if [[ -s /home/worker/.xsession-errors ]]; then
       # To share X issues
       cp /home/worker/.xsession-errors ~/artifacts/public/xsession-errors.log
     fi
-    # When you call this script with START_VNC or TASKCLUSTER_INTERACTIVE
-    # we make sure we do not kill xvfb so you do not lose your connection
-    xvfb_pid=`pidof Xvfb`
-    if [ -n "$xvfb_pid" ] && [ $START_VNC == false ] && [ $TASKCLUSTER_INTERACTIVE == false ] ; then
-        kill $xvfb_pid || true
-        screen -XS xvfb quit || true
-    fi
+    cleanup_xvfb
     exit $rv
 }
 trap cleanup EXIT INT
 
 # Unzip the mozharness ZIP file created by the build task
 if ! curl --fail -o mozharness.zip --retry 10 -L $MOZHARNESS_URL; then
     fail "failed to download mozharness zip"
 fi
@@ -68,39 +64,19 @@ fi
 
 # start up the pulseaudio daemon.  Note that it's important this occur
 # before the Xvfb startup for ubuntu 12.04, not for 16.04
 if $NEED_PULSEAUDIO; then
     pulseaudio --fail --daemonize --start
     pactl load-module module-null-sink
 fi
 
-# run Xvfb in the background, if necessary
+# run XVfb in the background, if necessary
 if $NEED_XVFB; then
-    screen -dmS xvfb Xvfb :0 -nolisten tcp -screen 0 1600x1200x24 \
-       > ~/artifacts/public/xvfb.log 2>&1
-    export DISPLAY=:0
-    # Only error code 255 matters, because it signifies that no
-    # display could be opened. As long as we can open the display
-    # tests should work. We'll retry a few times with a sleep before
-    # failing.
-    retry_count=0
-    max_retries=2
-    xvfb_test=0
-    until [ $retry_count -gt $max_retries ]; do
-        xvinfo || xvfb_test=$?
-        if [ $xvfb_test != 255 ]; then
-            retry_count=$(($max_retries + 1))
-        else
-            retry_count=$(($retry_count + 1))
-            echo "Failed to start Xvfb, retry: $retry_count"
-            sleep 2
-        fi
-    done
-    if [ $xvfb_test == 255 ]; then fail "xvfb did not start properly"; fi
+    start_xvfb '1600x1200x24' 0
 fi
 
 if $START_VNC; then
     x11vnc > ~/artifacts/public/x11vnc.log 2>&1 &
 fi
 
 if $NEED_WINDOW_MANAGER; then
     # This is read by xsession to select the window manager
--- a/taskcluster/scripts/tester/test-ubuntu1604.sh
+++ b/taskcluster/scripts/tester/test-ubuntu1604.sh
@@ -1,14 +1,16 @@
 #! /bin/bash -xe
 
 set -x -e
 
 echo "running as" $(id)
 
+. /home/worker/scripts/xvfb.sh
+
 ####
 # Taskcluster friendly wrapper for performing fx desktop tests via mozharness.
 ####
 
 # Inputs, with defaults
 
 : MOZHARNESS_URL                ${MOZHARNESS_URL}
 : MOZHARNESS_SCRIPT             ${MOZHARNESS_SCRIPT}
@@ -38,63 +40,36 @@ if [[ -z ${MOZHARNESS_CONFIG} ]]; then f
 mkdir -p ~/artifacts/public
 
 cleanup() {
     local rv=$?
     if [[ -s /home/worker/.xsession-errors ]]; then
       # To share X issues
       cp /home/worker/.xsession-errors ~/artifacts/public/xsession-errors.log
     fi
-    # When you call this script with START_VNC or TASKCLUSTER_INTERACTIVE
-    # we make sure we do not kill xvfb so you do not lose your connection
-    xvfb_pid=`pidof Xvfb`
-    if [ -n "$xvfb_pid" ] && [ $START_VNC == false ] && [ $TASKCLUSTER_INTERACTIVE == false ] ; then
-        kill $xvfb_pid || true
-        screen -XS xvfb quit || true
-    fi
+    cleanup_xvfb
     exit $rv
 }
 trap cleanup EXIT INT
 
 # Unzip the mozharness ZIP file created by the build task
 if ! curl --fail -o mozharness.zip --retry 10 -L $MOZHARNESS_URL; then
     fail "failed to download mozharness zip"
 fi
 rm -rf mozharness
 unzip -q mozharness.zip
 rm mozharness.zip
 
 if ! [ -d mozharness ]; then
     fail "mozharness zip did not contain mozharness/"
 fi
 
-# run Xvfb in the background, if necessary
+# run XVfb in the background, if necessary
 if $NEED_XVFB; then
-    screen -dmS xvfb Xvfb :0 -nolisten tcp -screen 0 1600x1200x24 \
-       > ~/artifacts/public/xvfb.log 2>&1
-    export DISPLAY=:0
-
-    # Only error code 255 matters, because it signifies that no
-    # display could be opened. As long as we can open the display
-    # tests should work. We'll retry a few times with a sleep before
-    # failing.
-    retry_count=0
-    max_retries=2
-    xvfb_test=0
-    until [ $retry_count -gt $max_retries ]; do
-        xvinfo || xvfb_test=$?
-        if [ $xvfb_test != 255 ]; then
-            retry_count=$(($max_retries + 1))
-        else
-            retry_count=$(($retry_count + 1))
-            echo "Failed to start Xvfb, retry: $retry_count"
-            sleep 2
-        fi
-    done
-    if [ $xvfb_test == 255 ]; then fail "xvfb did not start properly"; fi
+    start_xvfb '1600x1200x24' 0
 fi
 
 if $START_VNC; then
     x11vnc > ~/artifacts/public/x11vnc.log 2>&1 &
 fi
 
 if $NEED_WINDOW_MANAGER; then
     # This is read by xsession to select the window manager
--- a/testing/docker/desktop-build/Dockerfile
+++ b/testing/docker/desktop-build/Dockerfile
@@ -1,16 +1,20 @@
 FROM          taskcluster/centos6-build-upd:0.1.6.20160329195300
 MAINTAINER    Dustin J. Mitchell <dustin@mozilla.com>
 
 # Add build scripts; these are the entry points from the taskcluster worker, and
 # operate on environment variables
 ADD             bin /home/worker/bin
 RUN             chmod +x /home/worker/bin/*
 
+# Add wrapper scripts for xvfb allowing tasks to easily retry starting up xvfb
+# %include testing/docker/recipes/xvfb.sh
+ADD topsrcdir/testing/docker/recipes/xvfb.sh /home/worker/scripts/xvfb.sh
+
 # Add configuration
 COPY            dot-config                    /home/worker/.config
 
 # Generate machine uuid file
 RUN dbus-uuidgen --ensure=/var/lib/dbus/machine-id
 
 # Stubbed out credentials; mozharness looks for this file an issues a WARNING
 # if it's not found, which causes the build to fail.  Note that this needs to
@@ -23,10 +27,14 @@ ADD           oauth.txt /home/worker/
 # the directory where mozharness is run (not its --work-dir)
 ADD           buildprops.json /home/worker/
 
 # install tooltool directly from github where tooltool_wrapper.sh et al. expect
 # to find it
 RUN wget -O /builds/tooltool.py https://raw.githubusercontent.com/mozilla/build-tooltool/master/tooltool.py
 RUN chmod +x /builds/tooltool.py
 
+# Move installation to base centos6-build image once Bug 1272629 is fixed
+# Install the screen package here to use with xvfb.
+RUN yum install -y screen
+
 # Set a default command useful for debugging
 CMD ["/bin/bash", "--login"]
--- a/testing/docker/desktop-test/Dockerfile
+++ b/testing/docker/desktop-test/Dockerfile
@@ -14,16 +14,20 @@ COPY           tc-vcs-config.yml /etc/ta
 # TODO: remove
 ADD            https://raw.githubusercontent.com/taskcluster/buildbot-step/master/buildbot_step /home/worker/bin/buildbot_step
 RUN chmod u+x /home/worker/bin/buildbot_step
 
 # TODO: remove
 ADD            https://s3-us-west-2.amazonaws.com/test-caching/packages/linux64-stackwalk /usr/local/bin/linux64-minidump_stackwalk
 RUN chmod +x /usr/local/bin/linux64-minidump_stackwalk
 
+# Add wrapper scripts for xvfb allowing tasks to easily retry starting up xvfb
+# %include testing/docker/recipes/xvfb.sh
+ADD topsrcdir/testing/docker/recipes/xvfb.sh /home/worker/scripts/xvfb.sh
+
 # allow the worker user to access video devices
 RUN usermod -a -G video worker
 
 RUN mkdir Documents; mkdir Pictures; mkdir Music; mkdir Videos; mkdir artifacts
 
 # install tc-vcs and tc-npm-cache
 RUN npm install -g taskcluster-vcs@2.3.12 \
  && npm install -g taskcluster-npm-cache@1.1.14 \
--- a/testing/docker/desktop1604-test/Dockerfile
+++ b/testing/docker/desktop1604-test/Dockerfile
@@ -14,16 +14,20 @@ COPY           tc-vcs-config.yml /etc/ta
 # TODO: remove
 ADD            https://raw.githubusercontent.com/taskcluster/buildbot-step/master/buildbot_step /home/worker/bin/buildbot_step
 RUN chmod u+x /home/worker/bin/buildbot_step
 
 # TODO: remove
 ADD            https://s3-us-west-2.amazonaws.com/test-caching/packages/linux64-stackwalk /usr/local/bin/linux64-minidump_stackwalk
 RUN chmod +x /usr/local/bin/linux64-minidump_stackwalk
 
+# Add wrapper scripts for xvfb allowing tasks to easily retry starting up xvfb
+# %include testing/docker/recipes/xvfb.sh
+ADD topsrcdir/testing/docker/recipes/xvfb.sh /home/worker/scripts/xvfb.sh
+
 # allow the worker user to access video devices
 RUN usermod -a -G video worker
 
 RUN mkdir Documents; mkdir Pictures; mkdir Music; mkdir Videos; mkdir artifacts
 
 # install a new enough npm, plus tc-vcs and tc-npm-cache
 RUN npm install -g npm@^2.0.0 \
  && npm install -g taskcluster-vcs@2.3.12 \
new file mode 100644
--- /dev/null
+++ b/testing/docker/recipes/xvfb.sh
@@ -0,0 +1,75 @@
+#! /bin/bash -x
+
+set -x
+
+# Print a blank line, then the "[xvfb.sh:error]"-tagged message; exit 1.
+fail() {
+    echo; echo "[xvfb.sh:error]" "$@"
+    exit 1
+}
+
+# Kill any running Xvfb and quit the "xvfb" screen session, unless START_VNC
+# or TASKCLUSTER_INTERACTIVE is a non-empty value other than "false" (so that
+# a connected user does not lose the display).  Always returns 0.
+cleanup_xvfb() {
+    local keep_vnc=${START_VNC:-false} keep_tty=${TASKCLUSTER_INTERACTIVE:-false}
+    local pids="$(pidof Xvfb)"
+    [ -z "$pids" ] && return 0
+    [[ $keep_vnc != false || $keep_tty != false ]] && return 0
+    kill $pids || true  # left unquoted so several PIDs split into separate args
+    screen -XS xvfb quit || true
+}
+
+# Launch Xvfb in a detached screen session named "xvfb" and wait for the
+# display to become usable.
+#   $1 - screen geometry passed to Xvfb's -screen 0 (e.g. 1600x1200x24)
+#   $2 - display number; DISPLAY is exported as :$2
+# The display is probed with xvinfo up to max_retries + 1 times, sleeping
+# 2 seconds after each failed probe.  Returns 0 once a probe exits with
+# anything other than 255, 1 if every probe exited with 255.
+try_xvfb() {
+    screen -dmS xvfb Xvfb ":$2" -nolisten tcp -screen 0 "$1" \
+       > ~/artifacts/xvfb/xvfb.log 2>&1
+    export DISPLAY=":$2"
+
+    # Only error code 255 matters, because it signifies that no
+    # display could be opened. As long as we can open the display
+    # tests should work.
+    local retry_count=0
+    local max_retries=5
+    until [ "$retry_count" -gt "$max_retries" ]; do
+        # Reset before every probe: a 255 left over from an earlier probe
+        # must not hide a later successful one.
+        xvfb_test=0
+        xvinfo || xvfb_test=$?
+        if [ "$xvfb_test" != 255 ]; then
+            return 0
+        fi
+        retry_count=$((retry_count + 1))
+        echo "Failed to start Xvfb, retry: $retry_count"
+        sleep 2
+    done
+    return 1
+}
+
+# Start Xvfb via try_xvfb, relaunching it when the display never comes up.
+#   $1 - screen geometry (e.g. 1600x1200x24)
+#   $2 - display number
+# Makes up to max_retries + 1 launch attempts, sleeping 10 seconds after each
+# failed one, and calls fail (which exits 1) once every attempt has failed.
+start_xvfb() {
+    mkdir -p ~/artifacts/xvfb || true  # best effort; errors were ignored before too
+    local attempt=0
+    local max_retries=2
+    until [ "$attempt" -gt "$max_retries" ]; do
+        attempt=$((attempt + 1))
+        # Running try_xvfb as an `if` condition stops errexit from aborting
+        # on a failed attempt, so the caller's `set -e` state is left alone.
+        if try_xvfb "$1" "$2"; then
+            return 0
+        fi
+        sleep 10
+    done
+    # Report the number of launches actually made, not max_retries.
+    fail "Could not start xvfb after ${attempt} attempts"
+}