Bug 1197866 - Part 2: Represent visited URLs with Murmur hash. r=nalexander
☠☠ backed out by 4fc03405a420 ☠ ☠
authorAndrew Gaul <andrew@gaul.org>
Mon, 30 Apr 2018 21:15:15 -0700
changeset 490970 a80113dba9c130824111bb48f9fde2b9c4406fcd
parent 490969 7b9befdce34d9afd8c754ea561dca35da11a87fd
child 490971 32db311632da28ba478fe6d25f39ebbd365f7ae7
push id247
push userfmarier@mozilla.com
push dateSat, 27 Oct 2018 01:06:44 +0000
reviewersnalexander
bugs1197866
milestone65.0a1
Bug 1197866 - Part 2: Represent visited URLs with Murmur hash. r=nalexander This set can contain tens of thousands of entries each with 100 characters, consuming several megabytes. Representing this as specialized primitive long set instead of HashSet<String> reduces this to a few hundred kilobytes by removing the characters and halving the object overhead by using long instead of String and its char array. Using only 64 bits of the Murmush hash should have few collisions given the limited size of the map.
mobile/android/app/src/test/java/org/mozilla/gecko/SimpleLongOpenHashSetTest.java
mobile/android/base/java/org/mozilla/gecko/GlobalHistory.java
mobile/android/base/java/org/mozilla/gecko/SimpleLongOpenHashSet.java
new file mode 100644
--- /dev/null
+++ b/mobile/android/app/src/test/java/org/mozilla/gecko/SimpleLongOpenHashSetTest.java
@@ -0,0 +1,75 @@
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+package org.mozilla.gecko;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+public final class SimpleLongOpenHashSetTest {
+    @Test
+    public void testEmpty() {
+        SimpleLongOpenHashSet set = new SimpleLongOpenHashSet();
+
+        assertFalse(set.contains(42));
+        assertFalse(set.contains(0));
+        assertFalse(set.contains(-1));
+        assertTrue(set.size() == 0);
+    }
+
+    @Test
+    public void testOneElement() {
+        SimpleLongOpenHashSet set = new SimpleLongOpenHashSet();
+        assertTrue(set.add(42));
+        assertFalse(set.add(42));
+
+        assertTrue(set.contains(42));
+        assertFalse(set.contains(1));
+        assertFalse(set.contains(0));
+        assertFalse(set.contains(-1));
+        assertTrue(set.size() == 1);
+    }
+
+    @Test
+    public void testTwoElementsSameBucket() {
+        SimpleLongOpenHashSet set = new SimpleLongOpenHashSet(4);
+        assertTrue(set.add(0));
+        assertTrue(set.add(4));
+
+        assertTrue(set.contains(0));
+        assertTrue(set.contains(4));
+        assertFalse(set.contains(8));
+        assertTrue(set.size() == 2);
+    }
+
+    @Test
+    public void testManyElementsResize() {
+        SimpleLongOpenHashSet set = new SimpleLongOpenHashSet(4);
+        int count = 16;
+        for (int i = 0; i < count; ++i) {
+            assertTrue(set.add(i));
+        }
+
+        for (int i = 0; i < count; ++i) {
+            assertTrue(set.contains(i));
+        }
+        assertTrue(set.size() == count);
+    }
+
+    @Test
+    public void testConstructorEdgeCases() {
+        for (int i = 0; i < 3; ++i) {
+            SimpleLongOpenHashSet set = new SimpleLongOpenHashSet(i);
+            assertTrue(set.add(42));
+            assertFalse(set.add(42));
+
+            assertTrue(set.contains(42));
+            assertFalse(set.contains(1));
+            assertFalse(set.contains(0));
+            assertFalse(set.contains(-1));
+            assertTrue(set.size() == 1);
+        }
+    }
+}
--- a/mobile/android/base/java/org/mozilla/gecko/GlobalHistory.java
+++ b/mobile/android/base/java/org/mozilla/gecko/GlobalHistory.java
@@ -1,25 +1,28 @@
 /* -*- Mode: Java; c-basic-offset: 4; tab-width: 20; indent-tabs-mode: nil; -*-
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 package org.mozilla.gecko;
 
 import java.lang.ref.SoftReference;
-import java.util.HashSet;
+import java.security.NoSuchAlgorithmException;
 import java.util.LinkedList;
 import java.util.Queue;
-import java.util.Set;
+
+import com.github.yonik.MurmurHash3;
 
 import org.mozilla.gecko.annotation.WrapForJNI;
 import org.mozilla.gecko.db.BrowserDB;
 import org.mozilla.gecko.reader.ReaderModeUtils;
+import org.mozilla.gecko.sync.Utils;
 import org.mozilla.gecko.util.GeckoBundle;
+import org.mozilla.gecko.util.StringUtils;
 import org.mozilla.gecko.util.ThreadUtils;
 
 import android.content.ContentResolver;
 import android.content.Context;
 import android.database.Cursor;
 import android.os.Handler;
 import android.os.SystemClock;
 import android.util.Log;
@@ -44,83 +47,91 @@ class GlobalHistory {
     // this allows batching together multiple requests and processing them together,
     // which is more efficient.
     private static final long BATCHING_DELAY_MS = 100;
 
     private final Handler mHandler;                     // a background thread on which we can process requests
 
     //  Note: These fields are accessed through the NotificationRunnable inner class.
     final Queue<String> mPendingUris;           // URIs that need to be checked
-    SoftReference<Set<String>> mVisitedCache;   // cache of the visited URI list
+
+    // This set can grow to tens of thousands of entries and several megabytes
+    // of overhead when using HashSet<String>.  Compact this by storing
+    // a 64-bit hash instead of the original String.
+    SoftReference<SimpleLongOpenHashSet> mVisitedCache;  // cache of the visited URI list
+
     boolean mProcessing; // = false             // whether or not the runnable is queued/working
 
     private class NotifierRunnable implements Runnable {
         private final ContentResolver mContentResolver;
         private final BrowserDB mDB;
 
         public NotifierRunnable(final Context context) {
             mContentResolver = context.getContentResolver();
             mDB = BrowserDB.from(context);
         }
 
         @Override
         public void run() {
-            Set<String> visitedSet = mVisitedCache.get();
+            MurmurHash3.LongPair pair = new MurmurHash3.LongPair();
+            SimpleLongOpenHashSet visitedSet = mVisitedCache.get();
             if (visitedSet == null) {
                 // The cache was wiped. Repopulate it.
                 Log.w(LOGTAG, "Rebuilding visited link set...");
                 final long start = SystemClock.uptimeMillis();
                 final Cursor c = mDB.getAllVisitedHistory(mContentResolver);
                 if (c == null) {
                     return;
                 }
 
                 try {
-                    visitedSet = new HashSet<String>();
+                    visitedSet = new SimpleLongOpenHashSet(c.getCount());
                     if (c.moveToFirst()) {
                         do {
-                            visitedSet.add(c.getString(0));
+                            visitedSet.add(hashUrl(c.getBlob(0), pair));
                         } while (c.moveToNext());
                     }
-                    mVisitedCache = new SoftReference<Set<String>>(visitedSet);
+                    mVisitedCache = new SoftReference<SimpleLongOpenHashSet>(visitedSet);
                     final long end = SystemClock.uptimeMillis();
                     final long took = end - start;
+                    Log.i(LOGTAG, "Rebuilt visited link set containing " + visitedSet.size() + " URLs in " + took + " ms");
                     Telemetry.addToHistogram(TELEMETRY_HISTOGRAM_BUILD_VISITED_LINK, (int) Math.min(took, Integer.MAX_VALUE));
                 } finally {
                     c.close();
                 }
             }
 
             // This runs on the same handler thread as the checkUriVisited code,
             // so no synchronization is needed.
             while (true) {
                 final String uri = mPendingUris.poll();
                 if (uri == null) {
                     break;
                 }
 
-                if (visitedSet.contains(uri)) {
+                if (visitedSet.contains(hashUrl(uri.getBytes(StringUtils.UTF_8), pair))) {
                     GeckoAppShell.notifyUriVisited(uri);
                 }
             }
 
             mProcessing = false;
         }
     };
 
     private GlobalHistory() {
         mHandler = ThreadUtils.getBackgroundHandler();
         mPendingUris = new LinkedList<String>();
-        mVisitedCache = new SoftReference<Set<String>>(null);
+        mVisitedCache = new SoftReference<SimpleLongOpenHashSet>(null);
     }
 
     public void addToGeckoOnly(String uri) {
-        Set<String> visitedSet = mVisitedCache.get();
+        MurmurHash3.LongPair pair = new MurmurHash3.LongPair();
+        SimpleLongOpenHashSet visitedSet = mVisitedCache.get();
         if (visitedSet != null) {
-            visitedSet.add(uri);
+            visitedSet.add(hashUrl(uri.getBytes(StringUtils.UTF_8), pair));
         }
         GeckoAppShell.notifyUriVisited(uri);
     }
 
     public void add(final Context context, final BrowserDB db, String uri) {
         ThreadUtils.assertOnBackgroundThread();
         final long start = SystemClock.uptimeMillis();
 
@@ -201,9 +212,15 @@ class GlobalHistory {
         });
     }
 
     private void dispatchUriAvailableMessage(String uri) {
         final GeckoBundle message = new GeckoBundle();
         message.putString(EVENT_PARAM_URI, uri);
         EventDispatcher.getInstance().dispatch(EVENT_URI_AVAILABLE_IN_HISTORY, message);
     }
+
+    /** Calculate hash of input. */
+    private static long hashUrl(byte[] input, MurmurHash3.LongPair pair) {
+        MurmurHash3.murmurhash3_x64_128(input, 0, input.length, 0, pair);
+        return pair.val1 ^ pair.val2;
+    }
 }
new file mode 100644
--- /dev/null
+++ b/mobile/android/base/java/org/mozilla/gecko/SimpleLongOpenHashSet.java
@@ -0,0 +1,114 @@
+/* -*- Mode: Java; c-basic-offset: 4; tab-width: 20; indent-tabs-mode: nil; -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+package org.mozilla.gecko;
+
+import java.util.Arrays;
+
+/**
+ * A simple open hash table, specialied for primitive long values.
+ *
+ * There is no support for removing individual values but clearing the entire
+ * set is supported.
+ *
+ * The implementation uses linear probing.
+ */
+
+public final class SimpleLongOpenHashSet {
+    private long[] elems = new long[0];
+    private int size;
+    private static final int FILL_PERCENTAGE = 70;
+    /** Use a known value to indicate that an element is not present. */
+    private static final long SENTINEL = ~0L;
+    private static final int MINIMUM_SIZE = 16;
+
+    public SimpleLongOpenHashSet() {
+        this(MINIMUM_SIZE);
+    }
+
+    public SimpleLongOpenHashSet(int expectedSize) {
+        if (expectedSize <= MINIMUM_SIZE) {
+            expectedSize = MINIMUM_SIZE;
+        }
+        resize(expectedSize);
+    }
+
+    /** @return true if element exists in set */
+    public boolean contains(long elem) {
+        if (elem == SENTINEL) {
+            ++elem;
+        }
+        int index = index(elem);
+        // loop guaranteed to terminate since set has at least one SENTINEL
+        while (true) {
+            if (elems[index] == elem) {
+                return true;
+            } else if (elems[index] == -1) {
+                return false;
+            } else {
+                ++index;
+                if (index >= elems.length) {
+                    index = 0;
+                }
+            }
+        }
+    }
+
+    /**
+     * Add an element to the hash set.
+     *
+     * @return true if element did not previously exist
+     */
+    public boolean add(long elem) {
+        if ((size + 1) * 100 / elems.length > FILL_PERCENTAGE) {
+            resize(2 * elems.length);
+        }
+        if (elem == SENTINEL) {
+            ++elem;
+        }
+        int index = index(elem);
+        // loop guaranteed to terminate since set has at least one SENTINEL
+        while (true) {
+            if (elems[index] == elem) {
+                return false;
+            } else if (elems[index] == -1) {
+                elems[index] = elem;
+                ++size;
+                return true;
+            } else {
+                ++index;
+                if (index >= elems.length) {
+                    index = 0;
+                }
+            }
+        }
+    }
+
+    /** Remove all elements from the set. */
+    public void clear() {
+        Arrays.fill(elems, SENTINEL);
+        size = 0;
+    }
+
+    /** @return number of elements in the set. */
+    public int size() {
+        return size;
+    }
+
+    private void resize(int size) {
+        long[] oldElems = elems;
+        elems = new long[size];
+        clear();
+        for (long elem : oldElems) {
+            add(elem);
+        }
+    }
+
+    /** @return valid index for element */
+    private int index(long elem) {
+        // remove sign bit since modulus preserves it
+        return (int) Math.abs(elem) % elems.length;
+    }
+}