Bug 1369604: Replace Metadata JSON parsing with faster regex matching. r=liuche
☠☠ backed out by ec0c38721aff ☠ ☠
authorMichael Comella <michael.l.comella@gmail.com>
Fri, 28 Jul 2017 13:45:41 -0700
changeset 420530 7cd7e31b61ec48ebac89e32001cfd0b4724e67ee
parent 420529 f23174dcc5e60458b0bb44938a69f3c3b2443fef
child 420531 7232652c03042c62967082e9e34b95e3f7263ba2
push id7566
push usermtabara@mozilla.com
push dateWed, 02 Aug 2017 08:25:16 +0000
treeherdermozilla-beta@86913f512c3c [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersliuche
bugs1369604
milestone56.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1369604: Replace Metadata JSON parsing with faster regex matching. r=liuche I replaced JSON parsing for all highlight candidates (at most, 500) with a faster estimation using regex: we only use the full JSON parsing to get the perfect values for the items to be shown (~5). One caveat of this change: JSON parsing will be moved to the main thread when the getMetadataSlow is lazily-loaded. Disclaimer: my device seems to be running faster than yesterday so profiling may not be consistent but here are the profiling results: - HighlightsRanking.extractFeatures: 78.1% -> 54.5% - Highlight.<init>: 26.5% -> 14.5% - JSONObject.<init>: 11.4% -> rm'd - initFast*: (replaced JSONObject.<init> & friends) -> 4.2% With ^ the disclaimer in mind, runtime decreased from 12.6s to 5.3s (this is slower due to profiling). MozReview-Commit-ID: CTqAyDDmaJQ
mobile/android/app/src/test/java/org/mozilla/gecko/activitystream/homepanel/model/TestHighlight.java
mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Highlight.java
mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Metadata.java
mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightCandidate.java
mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightsRanking.java
new file mode 100644
--- /dev/null
+++ b/mobile/android/app/src/test/java/org/mozilla/gecko/activitystream/homepanel/model/TestHighlight.java
@@ -0,0 +1,91 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+package org.mozilla.gecko.activitystream.homepanel.model;
+
+import junit.framework.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mozilla.gecko.background.testhelpers.TestRunner;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static android.R.id.input;
+
+@RunWith(TestRunner.class)
+public class TestHighlight {
+
+    @Test
+    public void testInitFastImageURL() throws Exception {
+        final Map<String, String> jsonToExpected = new HashMap<>();
+        jsonToExpected.put(
+                "{\"image_url\":\"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/f\\/f1\\/Brauysegen_im_Bett.gif\"}",
+                "https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/f\\/f1\\/Brauysegen_im_Bett.gif");
+        jsonToExpected.put(
+                "{\"image_url\":\"https:\\/\\/www.apple.com\\/v\\/music\\/e\\/images\\/shared\\/og_image_family.png?201706051846\",\"provider\":\"Apple\",\"description_length\":135}",
+                "https:\\/\\/www.apple.com\\/v\\/music\\/e\\/images\\/shared\\/og_image_family.png?201706051846");
+        jsonToExpected.put(
+                "{\"image_url\":\"https:\\/\\/i.kinja-img.com\\/gawker-media\\/image\\/upload\\/s--XYjAnDao--\\/c_fill,fl_progressive,g_center,h_200,q_80,w_200\\/ghxlwgdztvqerb4zptdx.png\",\"provider\":\"Kotaku\",\"description_length\":17}",
+                "https:\\/\\/i.kinja-img.com\\/gawker-media\\/image\\/upload\\/s--XYjAnDao--\\/c_fill,fl_progressive,g_center,h_200,q_80,w_200\\/ghxlwgdztvqerb4zptdx.png");
+
+        for (final Map.Entry<String, String> entry : jsonToExpected.entrySet()) {
+            final String input = entry.getKey();
+            final String expected = entry.getValue();
+            Assert.assertEquals("For input: " + input, expected, Highlight.initFastImageURL(input));
+        }
+    }
+
+    @Test
+    public void testInitFastImageURLReturnsNullWithoutImageURL() {
+        final List<String> inputs = Arrays.asList(
+                "{\"description_length\":130}",
+                "{\"provider\":\"CNN\",\"description_length\":117}"
+        );
+
+        for (final String input : inputs) {
+            Assert.assertNull("For input: " + input, Highlight.initFastImageURL(input));
+        }
+    }
+
+    @Test
+    public void testInitFastImageURLNullInput() {
+        Assert.assertNull(Highlight.initFastImageURL(null));
+    }
+
+    @Test
+    public void testInitFastDescriptionLength() {
+        final Map<String, Integer> jsonToExpected = new HashMap<>();
+        jsonToExpected.put("{\"description_length\":130}", 130);
+        jsonToExpected.put(
+                "{\"image_url\":\"https:\\/\\/www.apple.com\\/v\\/music\\/e\\/images\\/shared\\/og_image_family.png?201706051846\",\"provider\":\"Apple\",\"description_length\":135}",
+                135);
+        jsonToExpected.put(
+                "{\"image_url\":\"https:\\/\\/i.kinja-img.com\\/gawker-media\\/image\\/upload\\/s--XYjAnDao--\\/c_fill,fl_progressive,g_center,h_200,q_80,w_200\\/ghxlwgdztvqerb4zptdx.png\",\"provider\":\"Kotaku\",\"description_length\":17}",
+                17);
+
+        for (final Map.Entry<String, Integer> entry : jsonToExpected.entrySet()) {
+            final String input = entry.getKey();
+            final int expected = entry.getValue();
+            Assert.assertEquals("For input: " + input, expected, Highlight.initFastDescriptionLength(input));
+        }
+    }
+
+    @Test
+    public void testInitFastDescriptionLengthMissingDescriptionLengthKey() {
+        Assert.assertEquals(0, Highlight.initFastDescriptionLength(
+                "{\"image_url\":\"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/f\\/f1\\/Brauysegen_im_Bett.gif\"}"));
+    }
+
+    @Test
+    public void testInitFastDescriptionLengthInvalidValue() {
+        Assert.assertEquals(0, Highlight.initFastDescriptionLength("{\"description_length\":abc}"));
+    }
+
+    @Test
+    public void testInitFastDescriptionLengthNullInput() {
+        Assert.assertEquals(0, Highlight.initFastDescriptionLength(null));
+    }
+}
\ No newline at end of file
--- a/mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Highlight.java
+++ b/mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Highlight.java
@@ -2,50 +2,98 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 package org.mozilla.gecko.activitystream.homepanel.model;
 
 import android.database.Cursor;
 import android.support.annotation.Nullable;
+import android.support.annotation.VisibleForTesting;
+import android.text.TextUtils;
 import android.text.format.DateUtils;
 import org.mozilla.gecko.activitystream.Utils;
 import org.mozilla.gecko.activitystream.ranking.HighlightCandidateCursorIndices;
+import org.mozilla.gecko.activitystream.ranking.HighlightsRanking;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 public class Highlight implements Item {
+
+    /**
+     * A pattern matching a json object containing the key "image_url" and extracting the value. afaik, these urls
+     * are not encoded so it's entirely possible that the url will contain a quote and we will not extract the whole
+     * url. However, given these are coming from websites providing favicon-like images, it's not likely a quote will
+     * appear and since these urls are only being used to compare against one another (as imageURLs in Highlight items),
+     * a partial URL may actually have the same behavior: good enough for me!
+     */
+    private static final Pattern FAST_IMAGE_URL_PATTERN = Pattern.compile("\"image_url\":\"([^\"]+)\"");
+
+    // A pattern matching a json object containing the key "description_length" and extracting the value: this
+    // regex should perfectly match values in json without whitespace.
+    private static final Pattern FAST_DESCRIPTION_LENGTH_PATTERN = Pattern.compile("\"description_length\":([0-9]+)");
+
     private final String title;
     private final String url;
     private final Utils.HighlightSource source;
     private final long time;
 
     private long historyId;
 
-    private Metadata metadata;
+    private @Nullable Metadata metadata;
+    private @Nullable final String metadataJSON;
+    private @Nullable String fastImageURL;
+    private int fastDescriptionLength;
 
     private @Nullable Boolean isPinned;
     private @Nullable Boolean isBookmarked;
 
     public static Highlight fromCursor(final Cursor cursor, final HighlightCandidateCursorIndices cursorIndices) {
         return new Highlight(cursor, cursorIndices);
     }
 
     private Highlight(final Cursor cursor, final HighlightCandidateCursorIndices cursorIndices) {
         title = cursor.getString(cursorIndices.titleColumnIndex);
         url = cursor.getString(cursorIndices.urlColumnIndex);
         source = Utils.highlightSource(cursor, cursorIndices);
         time = cursor.getLong(cursorIndices.highlightsDateColumnIndex);
 
         historyId = cursor.getLong(cursorIndices.historyIDColumnIndex);
 
-        metadata = Metadata.fromCursor(cursor, cursorIndices);
+        metadataJSON = cursor.getString(cursorIndices.metadataColumnIndex);
+        fastImageURL = initFastImageURL(metadataJSON);
+        fastDescriptionLength = initFastDescriptionLength(metadataJSON);
 
         updateState();
     }
 
+    /** Gets a fast image URL. Full docs for this method at {@link #getFastImageURLForComparison()} & {@link #FAST_IMAGE_URL_PATTERN}. */
+    @VisibleForTesting static @Nullable String initFastImageURL(final String metadataJSON) {
+        return extractFirstGroupFromMetadataJSON(metadataJSON, FAST_IMAGE_URL_PATTERN);
+    }
+
+    /** Gets a fast description length. Full docs for this method at {@link #getFastDescriptionLength()} & {@link #FAST_DESCRIPTION_LENGTH_PATTERN}. */
+    @VisibleForTesting static int initFastDescriptionLength(final String metadataJSON) {
+        final String extractedStr = extractFirstGroupFromMetadataJSON(metadataJSON, FAST_DESCRIPTION_LENGTH_PATTERN);
+        try {
+            return !TextUtils.isEmpty(extractedStr) ? Integer.parseInt(extractedStr) : 0;
+        } catch (final NumberFormatException e) { /* intentionally blank */ }
+        return 0;
+    }
+
+    private static @Nullable String extractFirstGroupFromMetadataJSON(final String metadataJSON, final Pattern pattern) {
+        if (metadataJSON == null) {
+            return null;
+        }
+
+        final Matcher matcher = pattern.matcher(metadataJSON);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
     private void updateState() {
         // We can only be certain of bookmark state if an item is a bookmark item.
         // Otherwise, due to the underlying highlights query, we have to look up states when
         // menus are displayed.
         switch (source) {
             case BOOKMARKED:
                 isBookmarked = true;
                 isPinned = null;
@@ -62,20 +110,77 @@ public class Highlight implements Item {
     public String getTitle() {
         return title;
     }
 
     public String getUrl() {
         return url;
     }
 
-    public Metadata getMetadata() {
+    /**
+     * Retrieves the metadata associated with this highlight, lazily loaded.
+     *
+     * AVOID USING THIS FOR A LARGE NUMBER OF ITEMS, particularly in {@link HighlightsRanking#extractFeatures(Cursor)},
+     * where we added lazy loading to improve performance.
+     *
+     * The JSONObject constructor inside Metadata takes a non-trivial amount of time to run so
+     * we lazily load it. At the time of writing, in {@link HighlightsRanking#extractFeatures(Cursor)}, we get
+     * 500 highlights before curating down to the ~5 shown items. For the non-displayed items, we use
+     * the getFast* methods and, for the shown items, lazy-load the metadata since only then is it necessary.
+     * These methods include:
+     * - {@link #getFastDescriptionLength()}
+     * - {@link #getFastImageURLForComparison()}
+     * - {@link #hasFastImageURL()}
+     */
+    public Metadata getMetadataSlow() {
+        if (metadata == null) {
+            metadata = new Metadata(metadataJSON);
+        }
         return metadata;
     }
 
+    /**
+     * Returns the image url in the highlight's metadata. This value does not provide valid image url but is
+     * consistent across invocations and can be used to compare against other Highlight's fast image urls.
+     * See {@link #getMetadataSlow()} for a description of why we use this method.
+     *
+     * To get a valid image url (at a performance penalty), use {@link #getMetadataSlow()}
+     * {@link #getMetadataSlow()} & {@link Metadata#getImageUrl()}.
+     *
+     * Note that this explanation is dependent on the implementation of {@link #initFastImageURL(String)}.
+     *
+     * @return the image url, or null if one could not be found.
+     */
+    public @Nullable String getFastImageURLForComparison() {
+        return fastImageURL;
+    }
+
+    /**
+     * Returns true if {@link #getFastImageURLForComparison()} has found an image url, false otherwise.
+     * See that method for caveats.
+     */
+    public boolean hasFastImageURL() {
+        return fastImageURL != null;
+    }
+
+    /**
+     * Returns the description length in the highlight's metadata. This value is expected to correct in all cases.
+     * See {@link #getMetadataSlow()} for why we use this method.
+     *
+     * This is a faster version of {@link #getMetadataSlow()} & {@link Metadata#getDescriptionLength()} because
+     * retrieving the metadata in this way does a full json parse, which is slower.
+     *
+     * Note: this explanation is dependent on the implementation of {@link #initFastDescriptionLength(String)}.
+     *
+     * @return the given description length, or 0 if no description length was given
+     */
+    public int getFastDescriptionLength() {
+        return fastDescriptionLength;
+    }
+
     public Boolean isBookmarked() {
         return isBookmarked;
     }
 
     public Boolean isPinned() {
         return isPinned;
     }
 
--- a/mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Metadata.java
+++ b/mobile/android/base/java/org/mozilla/gecko/activitystream/homepanel/model/Metadata.java
@@ -1,34 +1,29 @@
 /* -*- Mode: Java; c-basic-offset: 4; tab-width: 20; indent-tabs-mode: nil; -*-
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 package org.mozilla.gecko.activitystream.homepanel.model;
 
-import android.database.Cursor;
+import android.support.annotation.Nullable;
 import android.text.TextUtils;
 import android.util.Log;
 import org.json.JSONException;
 import org.json.JSONObject;
-import org.mozilla.gecko.activitystream.ranking.HighlightCandidateCursorIndices;
 
 public class Metadata {
     private static final String LOGTAG = "GeckoMetadata";
 
-    public static Metadata fromCursor(final Cursor cursor, final HighlightCandidateCursorIndices cursorIndices) {
-        return new Metadata(cursor.getString(cursorIndices.metadataColumnIndex));
-    }
-
     private String provider;
     private String imageUrl;
     private int descriptionLength;
 
-    private Metadata(String json) {
+    /* package-private */ Metadata(String json) {
         if (TextUtils.isEmpty(json)) {
             // Just use default values. It's better to have an empty Metadata object instead of
             // juggling with null values.
             return;
         }
 
         try {
             JSONObject object = new JSONObject(json);
@@ -42,26 +37,21 @@ public class Metadata {
     }
 
     public boolean hasProvider() {
         return !TextUtils.isEmpty(provider);
     }
 
     /**
      * Returns the URL of an image representing this site. Returns null if no image could be found.
-     * Use hasImageUrl() to avoid dealing with null values.
      */
-    public String getImageUrl() {
+    public @Nullable String getImageUrl() {
         return imageUrl;
     }
 
-    public boolean hasImageUrl() {
-        return imageUrl != null;
-    }
-
     public String getProvider() {
         return provider;
     }
 
     public int getDescriptionLength() {
         return descriptionLength;
     }
 }
--- a/mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightCandidate.java
+++ b/mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightCandidate.java
@@ -110,32 +110,32 @@ import java.lang.annotation.RetentionPol
         // TODO: Calculate values based using domain hash field (bug 1335817)
         final int occurrences = 1; // Number of times host shows up in history (Bug 1319485)
         final int domainCountSize = cursor.getCount(); // Number of domains visited (Bug 1319485)
 
         candidate.features.put(
                 FEATURE_DOMAIN_FREQUENCY,
                 Math.log(1 + domainCountSize / occurrences));
 
-        candidate.imageUrl = candidate.highlight.getMetadata().getImageUrl();
+        candidate.imageUrl = candidate.highlight.getFastImageURLForComparison();
 
         // The desktop add-on used the number of images returned form Embed.ly here. This is not the
         // same as total images on the page (think of small icons or the famous spacer.gif). So for
         // now this value will only be 1 or 0 depending on whether we found a good image. The desktop
         // team will face the same issue when switching from Embed.ly to the metadata-page-parser.
         // At this point we can try to find a fathom rule for determining a good value here.
         candidate.features.put(
                 FEATURE_IMAGE_COUNT,
-                candidate.highlight.getMetadata().hasImageUrl() ? 1d : 0d);
+                candidate.highlight.hasFastImageURL() ? 1d : 0d);
 
         // TODO: We do not store the size of the main image (Bug 1335819).
         // The desktop add-on calculates: Math.min(image.width * image.height, 1e5)
         candidate.features.put(
                 FEATURE_IMAGE_SIZE,
-                candidate.highlight.getMetadata().hasImageUrl() ? 1d : 0d
+                candidate.highlight.hasFastImageURL() ? 1d : 0d
         );
 
         // Historical note: before Bug 1335198, this value was not really the time at which the
         // bookmark was created by the user. Especially synchronized bookmarks could have a recent
         // value but have been bookmarked a long time ago.
         // Current behaviour: synchronized clients will, over time, converge DATE_CREATED field
         // to the real creation date, or the earliest one mentioned in the clients constellation.
         // We are sourcing highlights from the recent visited history - so in order to
@@ -147,17 +147,17 @@ import java.lang.annotation.RetentionPol
         } else {
             candidate.features.put(
                     FEATURE_BOOKMARK_AGE_IN_MILLISECONDS,
                     Math.max(1, System.currentTimeMillis() - cursor.getDouble(cursorIndices.bookmarkDateCreatedColumnIndex)));
         }
 
         candidate.features.put(
                 FEATURE_DESCRIPTION_LENGTH,
-                (double) candidate.highlight.getMetadata().getDescriptionLength());
+                (double) candidate.highlight.getFastDescriptionLength());
 
         final Uri uri = Uri.parse(candidate.highlight.getUrl());
 
         // We don't support opaque URIs (such as mailto:...), or URIs which do not have a valid host.
         // The latter might simply be URIs with invalid characters in them (such as underscore...).
         // See Bug 1363521 and Bug 1378967.
         if (!uri.isHierarchical() || uri.getHost() == null) {
             throw new InvalidHighlightCandidateException();
@@ -189,18 +189,22 @@ import java.lang.annotation.RetentionPol
     /* package-private */ String getUrl() {
         return highlight.getUrl();
     }
 
     /* package-private */ String getHost() {
         return host;
     }
 
+    /**
+     * Gets an estimate of the actual image url that should only be used to compare against other return
+     * values of this method. See {@link Highlight#getFastImageURLForComparison()} for more details.
+     */
     @Nullable
-    /* package-private */ String getImageUrl() {
+    /* package-private */ String getFastImageUrlForComparison() {
         return imageUrl;
     }
 
     /* package-private */ Highlight getHighlight() {
         return highlight;
     }
 
     /* package-private */ static class InvalidHighlightCandidateException extends Exception {
--- a/mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightsRanking.java
+++ b/mobile/android/base/java/org/mozilla/gecko/activitystream/ranking/HighlightsRanking.java
@@ -220,17 +220,17 @@ public class HighlightsRanking {
 
         applyInPairs(candidates, new Action2<HighlightCandidate, HighlightCandidate>() {
             @Override
             public void call(HighlightCandidate previous, HighlightCandidate next) {
                 boolean hasImage = previous.features.get(FEATURE_IMAGE_COUNT) > 0
                         && next.features.get(FEATURE_IMAGE_COUNT) > 0;
 
                 boolean similar = previous.getHost().equals(next.getHost());
-                similar |= hasImage && next.getImageUrl().equals(previous.getImageUrl());
+                similar |= hasImage && next.getFastImageUrlForComparison().equals(previous.getFastImageUrlForComparison());
 
                 if (similar) {
                     next.updateScore(next.getScore() * penalty[0]);
                     penalty[0] -= 0.2;
                 } else {
                     penalty[0] = 0.8;
                 }
             }