Bug 1636682 - Make some small improvements to the similarity metric. r=tarek
author Gregory Mierzwinski <gmierz2@outlook.com>
Tue, 19 May 2020 20:00:50 +0000
changeset 530890 5c5aa0a052b4fb126e719420d6dcdfd449339feb
parent 530889 d34c604f3e8b8438f87b78c454e75df320b7c575
child 530891 9d927bd44ed522093f6ca16be02cd0629fcebdcb
push id 116435
push user gmierz2@outlook.com
push date Tue, 19 May 2020 21:21:30 +0000
treeherder autoland@5c5aa0a052b4 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers tarek
bugs 1636682
milestone 78.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1636682 - Make some small improvements to the similarity metric. r=tarek This patch makes two main changes: 1. Calculates 2 sets of metrics, one that is calculated against its last task (same label) and another that is calculated against the live site variant if it exists. i. Live site tests only have one calculated. 2. Uses the task label instead of building it ourselves to ensure that the similarity metrics use the correct page load variant and to make the queries more robust. Differential Revision: https://phabricator.services.mozilla.com/D74529
taskcluster/ci/visual-metrics-dep/kind.yml
taskcluster/docker/visual-metrics/run-visual-metrics.py
taskcluster/docker/visual-metrics/similarity.py
taskcluster/taskgraph/transforms/visual_metrics_dep.py
--- a/taskcluster/ci/visual-metrics-dep/kind.yml
+++ b/taskcluster/ci/visual-metrics-dep/kind.yml
@@ -24,23 +24,20 @@ job-template:
     worker-type: t-linux-xlarge
 
     treeherder:
         tier: 3
         kind: other
 
     worker:
         docker-image: {in-tree: visual-metrics}
-        max-run-time: 900
+        max-run-time: 1800
         artifacts:
-            - type: file
-              name: public/perfherder-data.json
-              path: /builds/worker/artifacts/perfherder-data.json
-            - type: file
-              name: public/summary.json
-              path: /builds/worker/artifacts/summary.json
+            - type: directory
+              name: public/
+              path: /builds/worker/artifacts/
     fetches:
         fetch:
             - visual-metrics
     run:
         using: run-task
         command: /builds/worker/bin/run-visual-metrics.py -- --orange --perceptual --contentful --force --renderignore 5 --json --viewport
         checkout: false
--- a/taskcluster/docker/visual-metrics/run-visual-metrics.py
+++ b/taskcluster/docker/visual-metrics/run-visual-metrics.py
@@ -284,39 +284,30 @@ def main(log, args):
         "suites": suites,
     }
     for entry in suites:
         entry["extraOptions"] = jobs_json["extra_options"]
 
     # Try to get the similarity for all possible tests, this means that we
     # will also get a comparison of recorded vs. live sites to check
     # the on-going quality of our recordings.
-    similarity = None
-    if "android" in os.getenv("TC_PLATFORM", ""):
-        try:
-            from similarity import calculate_similarity
-            similarity = calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR, log)
-        except Exception:
-            log.info("Failed to calculate similarity score", exc_info=True)
-
-    if similarity:
-        suites[0]["subtests"].append({
-            "name": "Similarity3D",
-            "value": similarity[0],
-            "replicates": [similarity[0]],
-            "lowerIsBetter": False,
-            "unit": "a.u.",
-        })
-        suites[0]["subtests"].append({
-            "name": "Similarity2D",
-            "value": similarity[1],
-            "replicates": [similarity[1]],
-            "lowerIsBetter": False,
-            "unit": "a.u.",
-        })
+    try:
+        from similarity import calculate_similarity
+        for name, value in calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR).items():
+            if value is None:
+                continue
+            suites[0]["subtests"].append({
+                "name": name,
+                "value": value,
+                "replicates": [value],
+                "lowerIsBetter": False,
+                "unit": "a.u.",
+            })
+    except Exception:
+        log.info("Failed to calculate similarity score", exc_info=True)
 
     # Validates the perf data complies with perfherder schema.
     # The perfherder schema uses jsonschema so we can't use voluptuous here.
     validate(perf_data, PERFHERDER_SCHEMA)
 
     raw_perf_data = json.dumps(perf_data)
     with Path(OUTPUT_DIR, "perfherder-data.json").open("w") as f:
         f.write(raw_perf_data)
--- a/taskcluster/docker/visual-metrics/similarity.py
+++ b/taskcluster/docker/visual-metrics/similarity.py
@@ -5,27 +5,44 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 import cv2
 import json
 import numpy as np
 import os
 import pathlib
 import shutil
 import socket
+import structlog
 import tarfile
 import tempfile
 import urllib
 
 from functools import wraps
 from matplotlib import pyplot as plt
 from scipy.stats import spearmanr
 
 
-def open_data(file):
-    return cv2.VideoCapture(str(file))
+log = None
+
+
+# We add the `and` conditions to it later
+base_ad_query = {
+    "from": "task",
+    "limit": 1000,
+    "where": {
+        "and": []
+    },
+    "select": [
+        "action.start_time",
+        "run.name",
+        "task.artifacts",
+        "task.group.id",
+        "task.id"
+    ],
+}
 
 
 def socket_timeout(value=120):
     """Decorator for socket timeouts."""
     def _socket_timeout(func):
         @wraps(func)
         def __socket_timeout(*args, **kw):
             old = socket.getdefaulttimeout()
@@ -33,18 +50,22 @@ def socket_timeout(value=120):
             try:
                 return func(*args, **kw)
             finally:
                 socket.setdefaulttimeout(old)
         return __socket_timeout
     return _socket_timeout
 
 
+def _open_data(file):
+    return cv2.VideoCapture(str(file))
+
+
 @socket_timeout(120)
-def query_activedata(query_json, log):
+def _query_activedata(query_json):
     """Used to run queries on active data."""
     active_data_url = "http://activedata.allizom.org/query"
 
     req = urllib.request.Request(active_data_url)
     req.add_header("Content-Type", "application/json")
     jsondata = json.dumps(query_json)
 
     jsondataasbytes = jsondata.encode("utf-8")
@@ -54,172 +75,182 @@ def query_activedata(query_json, log):
     response = urllib.request.urlopen(req, jsondataasbytes)
     log.info("Status: %s" % {str(response.getcode())})
 
     data = json.loads(response.read().decode("utf8").replace("'", '"'))["data"]
     return data
 
 
 @socket_timeout(120)
-def download(url, loc, log):
+def _download(url, loc):
     """Downloads from a url (with a timeout)."""
     log.info("Downloading %s" % url)
     try:
         urllib.request.urlretrieve(url, loc)
     except Exception as e:
         log.info(str(e))
         return False
     return True
 
 
-def get_frames(video):
+def _get_frames(video):
     """Gets all frames from a video into a list."""
     allframes = []
     while video.isOpened():
         ret, frame = video.read()
         if ret:
             # Convert to gray to simplify the process
             allframes.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
         else:
             video.release()
             break
     return allframes
 
 
-def calculate_similarity(jobs_json, fetch_dir, output, log):
-    """Calculates the similarity score against the last live site test.
-
-    The technique works as follows:
-        1. Get the last live site test.
-        2. For each 15x15 video pairings, build a cross-correlation matrix:
-            1. Get each of the videos and calculate their histograms
-               across the full videos.
-            2. Calculate the correlation coefficient between these two.
-        3. Average the cross-correlation matrix to obtain the score.
-
-    The 2D similarity score is the same, except that it builds a histogram
-    from the final frame instead of the full video.
-
-    For finding the last live site, we use active-data. We search for
-    PGO android builds since this metric is only available for live sites that
-    run on android in mozilla-cental. Given that live sites currently
-    run on cron 3 days a week, then it's also reasonable to look for tasks
-    which have occurred before today and within the last two weeks at most.
-    But this is a TODO for future work, since we need to determine a better
-    way of selecting the last task (HG push logs?) - there's a lot that factors
-    into these choices, so it might require a multi-faceted approach.
-
-    Args:
-        jobs_json: The jobs JSON that holds extra information.
-        fetch_dir: The fetch directory that holds the new videos.
-        log: The logger.
-    Returns:
-        Two similarity scores (3D, 2D) as a float, or None if there was an issue.
-    """
-    app = jobs_json["application"]["name"]
-    test = jobs_json["jobs"][0]["test_name"]
-    splittest = test.split("-cold")
-
-    cold = ""
-    if len(splittest) > 0:
-        cold = ".*cold"
-    test = splittest[0]
-
-    # PGO vs. OPT shouldn't matter much, but we restrict it to PGO builds here
-    # for android, and desktop tests have the opt/pgo restriction removed
-    plat = os.getenv("TC_PLATFORM", "")
-    if "android" in plat:
-        plat = plat.replace("/opt", "/pgo")
-    else:
-        plat = plat.replace("/opt", "").replace("/pgo", "")
-    ad_query = {
-        "from": "task",
-        "limit": 1000,
-        "where": {
-            "and": [
-                {
-                    "regexp": {
-                        "run.name": ".*%s.*browsertime.*-live.*%s%s.*%s.*"
-                        % (plat, app, cold, test)
-                    }
-                },
-                {"not": {"prefix": {"run.name": "test-vismet"}}},
-                {"in": {"repo.branch.name": ["mozilla-central"]}},
-                {"gte": {"action.start_time": {"date": "today-week-week"}}},
-                {"lt": {"action.start_time": {"date": "today-1day"}}},
-                {"in": {"task.run.state": ["completed"]}},
-            ]
-        },
-        "select": ["action.start_time", "run.name", "task.artifacts"],
-    }
-
-    # Run the AD query and find the browsertime videos to download
+def _get_browsertime_results(query):
+    """Used to run an AD query and extract the browsertime results if they exist."""
     failed = False
     try:
-        data = query_activedata(ad_query, log)
+        data = _query_activedata(query)
     except Exception as e:
         log.info(str(e))
         failed = True
     if failed or not data:
         log.info("Couldn't get activedata data")
         return None
 
+    # Find the newest browsertime task
     log.info("Found %s datums" % str(len(data["action.start_time"])))
     maxind = np.argmax([float(t) for t in data["action.start_time"]])
     artifacts = data["task.artifacts"][maxind]
     btime_artifact = None
     for art in artifacts:
         if "browsertime-results" in art["name"]:
             btime_artifact = art["url"]
             break
     if not btime_artifact:
-        log.info("Can't find an older live site")
+        log.info("Can't find an older site test")
         return None
 
+    log.info("Comparing videos to TASK_GROUP=%s, TASK_ID=%s" % (
+        data["task.group.id"][maxind], data["task.id"][maxind]
+    ))
+
     # Download the browsertime videos and untar them
     tmpdir = tempfile.mkdtemp()
     loc = os.path.join(tmpdir, "tmpfile.tgz")
-    if not download(btime_artifact, loc, log):
+    if not _download(btime_artifact, loc):
+        log.info(
+            "Failed to download browsertime-results artifact from %s" % btime_artifact
+        )
         return None
     tmploc = tempfile.mkdtemp()
     try:
         with tarfile.open(str(loc)) as tar:
             tar.extractall(path=tmploc)
     except Exception:
         log.info(
             "Could not read/extract old browsertime results archive",
             path=loc,
             exc_info=True,
         )
         return None
 
-    # Find all the videos
-    oldmp4s = [str(f) for f in pathlib.Path(tmploc).rglob("*.mp4")]
-    log.info("Found %s old videos" % str(len(oldmp4s)))
-    newmp4s = [str(f) for f in pathlib.Path(fetch_dir).rglob("*.mp4")]
-    log.info("Found %s new videos" % str(len(newmp4s)))
+    return tmploc
+
+
+def _data_from_last_task(label):
+    """Gets the data from the last PGO/OPT task with the same label.
+
+    We look for both OPT and PGO tasks. The difference
+    between them should be minimal. This method also provides
+    a way to compare recordings from this task to another
+    known task based on the TC_GROUP_ID environment variable.
+    """
+    label_opt = label.replace("/pgo", "/opt")
+    label_pgo = label.replace("/opt", "/pgo")
+
+    base_ad_query["where"]["and"] = [
+        {"in": {"task.run.state": ["completed"]}},
+        {"or": [
+            {"eq": {"run.name": label_pgo}},
+            {"eq": {"run.name": label_opt}}
+        ]}
+    ]
+
+    task_group_id = os.getenv("TC_GROUP_ID", "")
+    if task_group_id:
+        base_ad_query["where"]["and"].append(
+            {"eq": {"task.group.id": task_group_id}}
+        )
+    else:
+        base_ad_query["where"]["and"].extend([
+            {"in": {"repo.branch.name": ["mozilla-central"]}},
+            {"gte": {"action.start_time": {"date": "today-week-week"}}},
+        ])
+
+    return _get_browsertime_results(base_ad_query)
+
 
-    # Finally, calculate the 2D/3D score
+def _data_from_last_live_task(label):
+    """Gets the data from the last live site PGO task."""
+    label_live = label.replace("/opt", "/pgo").replace("tp6m", "tp6m-live")
+
+    base_ad_query["where"]["and"] = [
+        {"in": {"repo.branch.name": ["mozilla-central"]}},
+        {"gte": {"action.start_time": {"date": "today-week-week"}}},
+        {"in": {"task.run.state": ["completed"]}},
+        {"eq": {"run.name": label_live}},
+    ]
+
+    return _get_browsertime_results(base_ad_query)
+
+
+def _get_similarity(old_videos_info, new_videos_info, output, prefix=""):
+    """Calculates a similarity score for two groupings of videos.
+
+    The technique works as follows:
+        1. Get the last live site test.
+        2. For each 15x15 video pairings, build a cross-correlation matrix:
+            1. Get each of the videos and calculate their histograms
+               across the full videos.
+            2. Calculate the correlation coefficient between these two.
+        3. Average the cross-correlation matrix to obtain the score.
+
+    The 2D similarity score is the same, except that it builds a histogram
+    from the final frame instead of the full video.
+
+    Args:
+        old_videos: List of old videos.
+        new_videos: List of new videos (from this task).
+        output: Location to output videos with low similarity scores.
+        prefix: Prefix a string to the output.
+    Returns:
+        Two similarity scores (3D, 2D) as a float.
+    """
     nhists = []
     nhists2d = []
 
-    total_vids = min(len(oldmp4s), len(newmp4s))
+    old_videos = [entry["data"] for entry in old_videos_info]
+    new_videos = [entry["data"] for entry in new_videos_info]
+
+    total_vids = min(len(old_videos), len(new_videos))
     xcorr = np.zeros((total_vids, total_vids))
     xcorr2d = np.zeros((total_vids, total_vids))
 
     for i in range(total_vids):
-        datao = np.asarray(get_frames(open_data(oldmp4s[i])))
+        datao = np.asarray(_get_frames(old_videos[i]))
 
         histo, _, _ = plt.hist(datao.flatten(), bins=255)
         histo2d, _, _ = plt.hist(datao[-1, :, :].flatten(), bins=255)
 
         for j in range(total_vids):
             if i == 0:
                 # Only calculate the histograms once; it takes time
-                datan = np.asarray(get_frames(open_data(newmp4s[j])))
+                datan = np.asarray(_get_frames(new_videos[j]))
 
                 histn, _, _ = plt.hist(datan.flatten(), bins=255)
                 histn2d, _, _ = plt.hist(datan[-1, :, :].flatten(), bins=255)
 
                 nhists.append(histn)
                 nhists2d.append(histn2d)
             else:
                 histn = nhists[j]
@@ -232,20 +263,98 @@ def calculate_similarity(jobs_json, fetc
             xcorr2d[i, j] = rho2d
 
     similarity = np.mean(xcorr)
     similarity2d = np.mean(xcorr2d)
 
     log.info("Average 3D similarity: %s" % str(np.round(similarity, 5)))
     log.info("Average 2D similarity: %s" % str(np.round(similarity2d, 5)))
 
-    if similarity < 0.5:
-        # For really low correlations, output the worst video pairing
+    if np.round(similarity, 1) <= 0.7 or np.round(similarity2d, 1) <= 0.7:
+        # For low correlations, output the worst video pairing
         # so that we can visually see what the issue was
         minind = np.unravel_index(np.argmin(xcorr, axis=None), xcorr.shape)
 
-        oldvid = oldmp4s[minind[0]]
-        shutil.copyfile(oldvid, str(pathlib.Path(output, "old_video.mp4")))
+        oldvid = old_videos_info[minind[0]]["path"]
+        shutil.copyfile(oldvid, str(pathlib.Path(output, "%sold_video.mp4" % prefix)))
 
-        newvid = newmp4s[minind[1]]
-        shutil.copyfile(newvid, str(pathlib.Path(output, "new_video.mp4")))
+        newvid = new_videos_info[minind[1]]["path"]
+        shutil.copyfile(newvid, str(pathlib.Path(output, "%snew_video.mp4" % prefix)))
 
     return np.round(similarity, 5), np.round(similarity2d, 5)
+
+
+def calculate_similarity(jobs_json, fetch_dir, output):
+    """Calculates the similarity score for this task.
+
+    Here we use activedata to find the last live site that ran and
+    to find the last task (with the same label) that ran. Those two
+    tasks are then compared to the current one and 4 metrics are produced.
+
+    For live sites, we only calculate 2 of these metrics, since the
+    playback similarity is not applicable to it.
+
+    Args:
+        jobs_json: The jobs JSON that holds extra information.
+        fetch_dir: The fetch directory that holds the new videos.
+        output: The output directory.
+    Returns:
+        A dictionary containing up to 4 different metrics (their values default
+        to None if a metric couldn't be calculated):
+            PlaybackSimilarity: Similarity of the full playback to a live site test.
+            PlaybackSimilarity2D: - // - (but for the final frame only)
+            Similarity: Similarity of the tests video recording to its last run.
+            Similarity2D: - // - (but for the final frame only)
+    """
+    global log
+    log = structlog.get_logger()
+
+    label = os.getenv("TC_LABEL", "")
+    if not label:
+        log.info("TC_LABEL is undefined, cannot calculate similarity metrics")
+        return {}
+
+    # Get all the newest videos from this task
+    new_btime_videos = [
+        {"data": _open_data(str(f)), "path": str(f)}
+        for f in pathlib.Path(fetch_dir).rglob("*.mp4")
+    ]
+    log.info("Found %s new videos" % str(len(new_btime_videos)))
+
+    # Get the similarity against the last task
+    old_btime_res = _data_from_last_task(label)
+    old_sim = old_sim2d = None
+    if old_btime_res:
+        old_btime_videos = [
+            {"data": _open_data(str(f)), "path": str(f)}
+            for f in pathlib.Path(old_btime_res).rglob("*.mp4")
+        ]
+        log.info("Found %s old videos" % str(len(old_btime_videos)))
+
+        old_sim, old_sim2d = _get_similarity(
+            old_btime_videos, new_btime_videos, output
+        )
+    else:
+        log.info("Failed to find an older test task")
+
+    # Compare recordings to their live site variant if it exists
+    live_sim = live_sim2d = None
+    if "live" not in jobs_json["extra_options"]:
+        live_btime_res = _data_from_last_live_task(label)
+        if live_btime_res:
+            live_btime_videos = [
+                {"data": _open_data(str(f)), "path": str(f)}
+                for f in pathlib.Path(live_btime_res).rglob("*.mp4")
+            ]
+            log.info("Found %s live videos" % str(len(live_btime_videos)))
+
+            live_sim, live_sim2d = _get_similarity(
+                live_btime_videos, new_btime_videos, output, prefix="live_"
+            )
+        else:
+            log.info("Failed to find a live site variant")
+
+    return {
+        "PlaybackSimilarity": live_sim,
+        "PlaybackSimilarity2D": live_sim2d,
+        "Similarity": old_sim,
+        "Similarity2D": old_sim2d,
+    }
--- a/taskcluster/taskgraph/transforms/visual_metrics_dep.py
+++ b/taskcluster/taskgraph/transforms/visual_metrics_dep.py
@@ -1,17 +1,18 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 """
 These transformations take a task description for a visual metrics task and
 add the necessary environment variables to run on the given inputs.
 """
+from __future__ import absolute_import, print_function, unicode_literals
+import os
 
-from __future__ import absolute_import, print_function, unicode_literals
 from taskgraph.transforms.base import TransformSequence
 
 
 transforms = TransformSequence()
 
 SYMBOL = "%(groupSymbol)s(%(symbol)s-vismet)"
 # the test- prefix makes the task SETA-optimized.
 LABEL = "test-vismet-%(platform)s-%(raptor_try_name)s"
@@ -28,17 +29,21 @@ def run_visual_metrics(config, jobs):
             attributes = dict(dep_job.attributes)
             attributes['platform'] = platform
             job['label'] = LABEL % attributes
             treeherder_info = dict(dep_job.task['extra']['treeherder'])
             job['treeherder']['symbol'] = SYMBOL % treeherder_info
 
             # Store the platform name so we can use it to calculate
             # the similarity metric against other tasks
-            job['worker'].setdefault('env', {})['TC_PLATFORM'] = platform
+            job['worker'].setdefault('env', {})['TC_LABEL'] = dep_job.label
+
+            # Setting the `TC_GROUP_ID` environment variable to a task group ID
+            # is a simple way to compare videos to a specific task group
+            job['worker']['env']['TC_GROUP_ID'] = os.getenv("TC_GROUP_ID", "")
 
             # vismet runs on Linux but we want to have it displayed
             # alongside the job it was triggered by to make it easier for
             # people to find it back.
             job['treeherder']['platform'] = platform
 
             # run-on-projects needs to be set based on the dependent task
             job['run-on-projects'] = attributes['run_on_projects']