Test similarity score (draft)
author      Gregory Mierzwinski <gmierz2@outlook.com>
date        Wed, 29 Apr 2020 20:19:26 -0400
changeset   2852265:557d010171a836c892bd8cfb610a9d72c2b066b9
parent      2852087:9549da267024243448900d9e5b4ecf01464cbdd2
child       2852266:ddc7e148661363471aefda57eb2762befce39aa0
push id     528809
push user   gmierz2@outlook.com
push date   Thu, 30 Apr 2020 02:07:35 +0000
treeherder  try@ddc7e1486613
milestone   77.0a1
Test similarity score.
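
This draft adds a "Similarity" subtest for live-site tests. When a vismet
task runs with the "live" extra option, it queries ActiveData for the most
recent comparable browsertime run on mozilla-central, downloads that run's
videos, and compares grayscale pixel histograms of the old and new videos
using a Spearman rank correlation. The average correlation across all video
pairs is reported to Perfherder as an arbitrary-unit value.
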
taskcluster/docker/visual-metrics/Dockerfile
taskcluster/docker/visual-metrics/run-visual-metrics.py
taskcluster/taskgraph/transforms/visual_metrics_dep.py
testing/raptor/raptor/results.py
--- a/taskcluster/docker/visual-metrics/Dockerfile
+++ b/taskcluster/docker/visual-metrics/Dockerfile
@@ -16,13 +16,17 @@ WORKDIR /builds/worker
 
 # %include testing/mozharness/external_tools/performance-artifact-schema.json
 ADD topsrcdir/testing/mozharness/external_tools/performance-artifact-schema.json /builds/worker/performance-artifact-schema.json
 
 COPY requirements.txt /builds/worker/requirements.txt
 RUN pip3 install setuptools==42.0.2
 RUN pip3 install --require-hashes -r /builds/worker/requirements.txt && \
     rm /builds/worker/requirements.txt
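+# Dependencies for the experimental similarity metric. NOTE: these are
+# unpinned; they should eventually be added to requirements.txt with
+# hashes like the packages above.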
+RUN pip3 install numpy scipy opencv-python
 
 COPY run-visual-metrics.py /builds/worker/bin/run-visual-metrics.py
 RUN chmod +x /builds/worker/bin/run-visual-metrics.py
 
 VOLUME /builds/worker/artifacts/
old mode 100644
new mode 100755
--- a/taskcluster/docker/visual-metrics/run-visual-metrics.py
+++ b/taskcluster/docker/visual-metrics/run-visual-metrics.py
@@ -4,30 +4,39 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 """Instrument visualmetrics.py to run in parallel."""
 
 import argparse
 import json
 import os
+import signal
 import statistics
 import subprocess
 import sys
 import tarfile
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from multiprocessing import cpu_count
 from pathlib import Path
 
 import attr
 import structlog
 from jsonschema import validate
 from voluptuous import ALLOW_EXTRA, Required, Schema
 
+from urllib.request import Request, urlopen, urlretrieve
+
 
 #: The directory where artifacts from this job will be placed.
 OUTPUT_DIR = Path("/", "builds", "worker", "artifacts")
 
 #: A job to process through visualmetrics.py
 @attr.s
 class Job:
     #: The name of the test.
@@ -42,17 +51,17 @@ class Job:
 
 #: The schema for validating jobs.
 JOB_SCHEMA = Schema(
     {
         Required("jobs"): [
             {Required("test_name"): str, Required("browsertime_json_path"): str}
         ],
         Required("application"): {Required("name"): str, "version": str},
-        Required("extraOptions"): [str],
+        Required("extraOptions"): [str]
     }
 )
 
 #: A partial schema for browsertime.json files.
 BROWSERTIME_SCHEMA = Schema(
     [{Required("files"): {Required("video"): [str]}}], extra=ALLOW_EXTRA
 )
 
@@ -167,16 +176,36 @@ def read_json(json_path, schema):
         schema(data)
     except Exception:
         log.error("JSON failed to validate", exc_info=True)
         raise
 
     return data
 
 
+def query_activedata(query_json):
+    """POST a query to the ActiveData service and return its data."""
+    active_data_url = "http://activedata.allizom.org/query"
+
+    req = Request(active_data_url)
+    req.add_header("Content-Type", "application/json")
+    jsondataasbytes = json.dumps(query_json).encode("utf-8")
+    req.add_header("Content-Length", len(jsondataasbytes))
+
+    print("Querying ActiveData...")
+    response = urlopen(req, jsondataasbytes)
+    print("Status: " + str(response.getcode()))
+
+    # The response is already valid JSON; decode it directly rather than
+    # rewriting quotes, which would corrupt values containing apostrophes.
+    data = json.loads(response.read().decode("utf-8"))["data"]
+    return data
+
+
 def main(log, args):
     """Run visualmetrics.py in parallel.
 
     Args:
         log: The structlog logger instance.
         args: The parsed arguments from the argument parser.
 
     Returns:
@@ -281,16 +310,178 @@ def main(log, args):
         "framework": {"name": "browsertime"},
         "application": jobs_json["application"],
         "type": "vismet",
         "suites": suites,
     }
     for entry in suites:
         entry["extraOptions"] = jobs_json["extraOptions"]
 
+    avgcorr = None
+    if "live" in jobs_json["extraOptions"]:
+        # Compute how similar the new recordings are to the most recent
+        # recordings of this test from mozilla-central.
+
+        # Find the last good job and get its videos
+        app = jobs_json["application"]["name"]
+        test = jobs_json["jobs"][0]["test_name"]
+        test = test.split("-cold")[0]
+        plat = os.getenv("TC_PLATFORM", "").replace("/opt", "").replace("/pgo", "")
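+        # ActiveData query: completed browsertime live-site tasks for this
+        # platform/app/test on mozilla-central from roughly the past two
+        # weeks, excluding the last day and the vismet tasks themselves.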
+        ad_query = {
+            "from": "task",
+            "limit": 1000,
+            "where": {"and": [
+                {"regexp": {"run.name": ".*%s.*browsertime.*-live.*%s.*%s.*" % (plat, app, test)}},
+                {"not": {"prefix": {"run.name": "test-vismet"}}},
+                {"in": {"repo.branch.name": ["mozilla-central"]}},
+                {"gte": {"action.start_time": {"date": "today-week-week"}}},
+                {"lt": {"action.start_time": {"date": "today-1day"}}},
+                {"in": {"task.run.state": ["completed"]}},
+            ]},
+            "select": ["action.start_time", "run.name", "task.artifacts"],
+        }
+        log.info(json.dumps(ad_query, indent=4))
+
+        # The similarity calculation is only needed for live-site runs, and
+        # its third-party dependencies only exist in the visual-metrics
+        # image, so everything is imported lazily here.
+        import pathlib
+        import tempfile
+
+        import cv2
+        import numpy as np
+        from scipy.stats import spearmanr
+
+        def handler(signum, frame):
+            raise Exception("Timed out.")
+        signal.signal(signal.SIGALRM, handler)
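+        # NOTE: SIGALRM only works on Unix and in the main thread, both of
+        # which hold for this Linux-based task.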
+
+        def download(url, loc):
+            """Downloads a JSON through a thread"""
+            print("Downloading %s" % url)
+            try:
+                # Timeout after 2 minutes
+                signal.alarm(120)
+                urlretrieve(url, loc)
+            except Exception as e:
+                log.info(str(e))
+                return False
+            finally:
+                signal.alarm(0)
+            return True
+
+        TMPDIR = tempfile.mkdtemp()
+        loc = os.path.join(TMPDIR, "tmpfile.tgz")
+
+        failed = False
+        try:
+            signal.alarm(120)
+            data = query_activedata(ad_query)
+        except Exception as e:
+            log.info(str(e))
+            failed = True
+        finally:
+            signal.alarm(0)
+
+        if failed:
+            # Without ActiveData results there is nothing to compare against.
+            log.info("Couldn't get ActiveData data")
+
+        if not failed:
+            maxind = 0
+            log.info("Found %s datums" % str(len(data["action.start_time"])))
+            if len(data["action.start_time"]) > 1:
+                maxind = np.argmax([float(t) for t in data["action.start_time"]])
+
+            artifacts = data["task.artifacts"][maxind]
+            btime_artifact = None
+            for art in artifacts:
+                if "browsertime-results" in art["name"]:
+                    btime_artifact = art["url"]
+                    break
+            if not btime_artifact:
+                raise Exception("Couldn't find an older live-site run to compare against")
+
+            log.info(btime_artifact)
+            download(btime_artifact, loc)
+
+            tmploc = tempfile.mkdtemp()
+            try:
+                with tarfile.open(str(loc)) as tar:
+                    tar.extractall(path=tmploc)
+            except Exception:
+                log.error(
+                    "Could not extract the browsertime results archive",
+                    path=str(loc),
+                    exc_info=True,
+                )
+                return 1
+
+            oldmp4s = [
+                str(f)
+                for f in pathlib.Path(tmploc).rglob("*.mp4")
+            ]
+            log.info("Found %s old videos" % str(len(oldmp4s)))
+
+            newmp4s = [
+                str(f)
+                for f in pathlib.Path(fetch_dir).rglob("*.mp4")
+            ]
+            log.info("Found %s new videos" % str(len(newmp4s)))
+
+            total_old = len(oldmp4s)
+            total_new = len(newmp4s)
+            total_vids = min(total_old, total_new)
+
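+            # Helpers: open a video with OpenCV and pull all of its frames
+            # out as grayscale arrays.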
+            def open_data(file):
+                return cv2.VideoCapture(str(file))
+
+            def get_frames(video):
+                allframes = []
+                while video.isOpened():
+                    ret, frame = video.read()
+                    if ret:
+                        allframes.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
+                    else:
+                        video.release()
+                        break
+                return allframes
+
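+            # Compare every old/new video pair: histogram the grayscale
+            # pixel values of each video and use the Spearman rank
+            # correlation between histograms as the similarity score.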
+            ohists = []
+            xcorr = np.zeros((total_vids, total_vids))
+            for i in range(total_vids):
+                dat1 = get_frames(open_data(oldmp4s[i]))
+                allbn = np.asarray(dat1).flatten()
+                # Use a fixed range so the bins line up across videos.
+                histn, _ = np.histogram(allbn, bins=255, range=(0, 255))
+
+                for j in range(total_vids):
+                    if i == 0:
+                        # Histogram each new video once and cache it.
+                        dat2 = get_frames(open_data(newmp4s[j]))
+                        allbo = np.asarray(dat2).flatten()
+                        histo, _ = np.histogram(allbo, bins=255, range=(0, 255))
+                        ohists.append(histo)
+                    else:
+                        histo = ohists[j]
+
+                    rho, _ = spearmanr(histn, histo)
+                    xcorr[i, j] = rho
+
+            avgcorr = np.mean(xcorr)
+            log.info("Average corr: %s" % str(avgcorr))
+
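+    # Report the average correlation as a Perfherder subtest so it can be
+    # tracked alongside the visual metrics.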
+    if avgcorr is not None:
+        suites[0]["subtests"].append({
+            "name": "Similarity",
+            # Cast to a plain float so json.dumps can serialize the value.
+            "value": round(float(avgcorr), 5),
+            "replicates": [round(float(avgcorr), 5)],
+            "lowerIsBetter": False,
+            "unit": "a.u.",
+        })
+
     # Validates the perf data complies with perfherder schema.
     # The perfherder schema uses jsonschema so we can't use voluptuous here.
     validate(perf_data, PERFHERDER_SCHEMA)
 
     raw_perf_data = json.dumps(perf_data)
     with Path(OUTPUT_DIR, "perfherder-data.json").open("w") as f:
         f.write(raw_perf_data)
     # Prints the data in logs for Perfherder to pick it up.
--- a/taskcluster/taskgraph/transforms/visual_metrics_dep.py
+++ b/taskcluster/taskgraph/transforms/visual_metrics_dep.py
@@ -25,16 +25,18 @@ def run_visual_metrics(config, jobs):
             platform = dep_job.task['extra']['treeherder-platform']
             job['dependencies'] = {dep_job.label: dep_job.label}
             job['fetches'][dep_job.label] = ['/public/test_info/browsertime-results.tgz']
             attributes = dict(dep_job.attributes)
             attributes['platform'] = platform
             job['label'] = LABEL % attributes
             treeherder_info = dict(dep_job.task['extra']['treeherder'])
             job['treeherder']['symbol'] = SYMBOL % treeherder_info
+            # Expose the platform to the vismet task so it can find
+            # comparable live-site runs.
+            job['worker'].setdefault('env', {})['TC_PLATFORM'] = platform
 
             # vismet runs on Linux but we want to have it displayed
             # alongside the job it was triggered by to make it easier for
             # people to find it back.
             job['treeherder']['platform'] = platform
 
             # run-on-projects needs to be set based on the dependent task
             job['run-on-projects'] = attributes['run_on_projects']
--- a/testing/raptor/raptor/results.py
+++ b/testing/raptor/raptor/results.py
@@ -661,17 +661,17 @@ class BrowsertimeResultsHandler(Perftest
             validate_success = self._validate_treeherder_data(output, out_perfdata)
 
         if len(video_jobs) > 0:
             # The video list and application metadata (browser name and
             # optionally version) that will be used in the visual metrics task.
             jobs_json = {
                 "jobs": video_jobs,
                 "application": {"name": self.browser_name},
-                "extraOptions": output.summarized_data["suites"][0]["extraOptions"]
+                "extraOptions": output.summarized_results["suites"][0]["extraOptions"]
             }
 
             if self.browser_version is not None:
                 jobs_json["application"]["version"] = self.browser_version
 
             jobs_file = os.path.join(self.result_dir(), "jobs.json")
             LOG.info(
                 "Writing video jobs and application data {} into {}".format(