Bug 1387694 - Top frecent sites query should dedupe to the more frecent rev_nowww. r=ursula draft
authorEd Lee <edilee@mozilla.com>
Sat, 05 Aug 2017 01:55:06 -0700
changeset 642608 45ec57a6f5480d725df94964f1840a599e7d90d7
parent 621399 4cfb674227051e22bab651e5759f3de503a50560
child 725055 699b459ae63a13cb4ff8aa6ecf6b31078586c652
push id72820
push userbmo:edilee@mozilla.com
push dateTue, 08 Aug 2017 15:35:41 +0000
reviewersursula
bugs1387694
milestone57.0a1
Bug 1387694 - Top frecent sites query should dedupe to the more frecent rev_nowww. r=ursula Refactor the SQL to be extremely fast and push complex logic to JS. MozReview-Commit-ID: E6707hC9K5D
toolkit/modules/NewTabUtils.jsm
toolkit/modules/tests/xpcshell/test_NewTabUtils.js
--- a/toolkit/modules/NewTabUtils.jsm
+++ b/toolkit/modules/NewTabUtils.jsm
@@ -930,65 +930,97 @@ var ActivityStreamProvider = {
    * @param {Object} aOptions
    *          options.ignoreBlocked: Do not filter out blocked links .
    *
    * @returns {Promise} Returns a promise with the array of links as payload.
    */
   async getTopFrecentSites(aOptions = {}) {
     let {ignoreBlocked} = aOptions;
 
-    // GROUP first by rev_host to get the most-frecent page of an exact host
-    // then GROUP by rev_nowww to dedupe between top two pages of nowww host.
-    // Note that unlike mysql, sqlite picks the last raw from groupby bucket.
-    // Which is why subselect orders frecency and last_visit_date backwards.
-    // In general the groupby behavior in the absence of aggregates is not
-    // defined in SQL, hence we are relying on sqlite implementation that may
-    // change in the future.
+    // Get extra links in case they're blocked and double the usual limit in
+    // case the host is deduped between with www or not-www (i.e., 2 hosts) as
+    // well as an extra buffer for multiple pages from the same exact host.
+    const limit = Object.keys(BlockedLinks.links).length + TOP_SITES_LIMIT * 2 * 10;
 
-    const limit = Object.keys(BlockedLinks.links).length + TOP_SITES_LIMIT;
-    let sqlQuery = `/* do not warn (bug N/A): do not need index */
-                    SELECT url, title, SUM(frecency) frecency, guid, bookmarkGuid,
-                     last_visit_date / 1000 as lastVisitDate, "history" as type
-                    FROM (SELECT * FROM (
-                      SELECT
+    // Keep this query fast with frecency-indexed lookups (even with excess
+    // rows) and shift the more complex logic to post-processing afterwards
+    const sqlQuery = `SELECT
+                        moz_bookmarks.guid AS bookmarkGuid,
+                        frecency,
+                        moz_places.guid,
+                        last_visit_date / 1000 AS lastVisitDate,
                         rev_host,
-                        fixup_url(get_unreversed_host(rev_host)) AS rev_nowww,
-                        moz_places.url,
                         moz_places.title,
-                        frecency,
-                        last_visit_date,
-                        moz_places.guid AS guid,
-                        moz_bookmarks.guid AS bookmarkGuid
+                        url
                       FROM moz_places
                       LEFT JOIN moz_bookmarks
-                      on moz_places.id = moz_bookmarks.fk
+                      ON moz_places.id = moz_bookmarks.fk
                       WHERE hidden = 0 AND last_visit_date NOTNULL
                       AND (SUBSTR(moz_places.url, 1, 6) == "https:" OR SUBSTR(moz_places.url, 1, 5) == "http:")
-                      ORDER BY frecency, last_visit_date, moz_places.url DESC
-                    ) GROUP BY rev_host)
-                    GROUP BY rev_nowww
-                    ORDER BY frecency DESC, lastVisitDate DESC, url
-                    LIMIT ${limit}`;
+                      ORDER BY frecency DESC
+                      LIMIT ${limit}`;
 
     let links = await this.executePlacesQuery(sqlQuery, {
       columns: [
         "bookmarkGuid",
         "frecency",
         "guid",
         "lastVisitDate",
         "title",
-        "type",
         "url"
       ]
     });
 
-    if (!ignoreBlocked) {
-      links = links.filter(link => !BlockedLinks.isBlocked(link));
+    // Determine if the other link is "better" (larger frecency, more recent,
+    // lexicographically earlier url)
+    function isOtherBetter(link, other) {
+      return other.frecency > link.frecency ||
+        (other.frecency === link.frecency && other.lastVisitDate > link.lastVisitDate ||
+         other.lastVisitDate === link.lastVisitDate && other.url < link.url);
+    }
+
+    // Update a host Map with the better link
+    function setBetterLink(map, link, hostMatcher, combiner = () => {}) {
+      const host = hostMatcher(link.url)[1];
+      if (map.has(host)) {
+        const other = map.get(host);
+        if (isOtherBetter(link, other)) {
+          link = other;
+        }
+        combiner(link, other);
+      }
+      map.set(host, link);
     }
-    links = links.slice(0, TOP_SITES_LIMIT);
+
+    // Clean up the returned links by removing blocked, deduping, etc.
+    const exactHosts = new Map();
+    for (const link of links) {
+      if (!ignoreBlocked && BlockedLinks.isBlocked(link)) {
+        continue;
+      }
+
+      link.type = "history";
+
+      // First we want to find the best link for an exact host
+      setBetterLink(exactHosts, link, url => url.match(/:\/\/([^\/]+)/));
+    }
+
+    // Clean up exact hosts to dedupe as non-www hosts
+    const hosts = new Map();
+    for (const link of exactHosts.values()) {
+      setBetterLink(hosts, link, url => url.match(/:\/\/(?:www\.)?([^\/]+)/),
+        // Combine frecencies when deduping these links
+        (targetLink, otherLink) => {
+          targetLink.frecency = link.frecency + otherLink.frecency;
+        });
+    }
+
+    // Pick out the top links using the same comparer as before
+    links = [...hosts.values()].sort(isOtherBetter).slice(0, TOP_SITES_LIMIT);
+
     links = await this._addFavicons(links);
     return this._processLinks(links);
   },
 
   /**
    * Gets a specific bookmark given an id
    *
    * @param {String} aGuid
--- a/toolkit/modules/tests/xpcshell/test_NewTabUtils.js
+++ b/toolkit/modules/tests/xpcshell/test_NewTabUtils.js
@@ -387,23 +387,38 @@ add_task(async function getTopFrecentSit
   await PlacesTestUtils.addVisits(testURI);
 
   // Test combined frecency score
   links = await provider.getTopSites();
   Assert.equal(links.length, 1, "adding both www. and no-www. yields one link");
   Assert.equal(links[0].frecency, 200, "frecency scores are combined");
 
   // add another page visit with www and without www
-  testURI = "http://mozilla.com/page";
-  await PlacesTestUtils.addVisits(testURI);
-  testURI = "http://www.mozilla.com/page";
-  await PlacesTestUtils.addVisits(testURI);
+  let noWWW = "http://mozilla.com/page";
+  await PlacesTestUtils.addVisits(noWWW);
+  let withWWW = "http://www.mozilla.com/page";
+  await PlacesTestUtils.addVisits(withWWW);
   links = await provider.getTopSites();
   Assert.equal(links.length, 1, "adding both www. and no-www. yields one link");
   Assert.equal(links[0].frecency, 200, "frecency scores are combined ignoring extra pages");
+
+  // add another visit with www
+  await PlacesTestUtils.addVisits(withWWW);
+  links = await provider.getTopSites();
+  Assert.equal(links.length, 1, "still yields one link");
+  Assert.equal(links[0].url, withWWW, "more frecent www link is used");
+  Assert.equal(links[0].frecency, 300, "frecency scores are combined ignoring extra pages");
+
+  // add a couple more visits to the no-www page
+  await PlacesTestUtils.addVisits(noWWW);
+  await PlacesTestUtils.addVisits(noWWW);
+  links = await provider.getTopSites();
+  Assert.equal(links.length, 1, "still yields one link");
+  Assert.equal(links[0].url, noWWW, "now more frecent no-www link is used");
+  Assert.equal(links[0].frecency, 500, "frecency scores are combined ignoring extra pages");
 });
 
 add_task(async function getTopFrencentSites_maxLimit() {
   await setUpActivityStreamTest();
 
   let provider = NewTabUtils.activityStreamLinks;
 
   // add many visits
@@ -438,16 +453,23 @@ add_task(async function getTopFrencentSi
   Assert.equal(links.length, 1, "http:// is an allowed protocol");
 
   // and just to be sure, add a visit to a site with ftp:// protocol
   testURI = "ftp://bad/example";
   await PlacesTestUtils.addVisits(testURI);
 
   links = await provider.getTopSites();
   Assert.equal(links.length, 1, "we still only accept http:// and https:// for top sites");
+
+  // add a different allowed protocol
+  testURI = "https://https";
+  await PlacesTestUtils.addVisits(testURI);
+
+  links = await provider.getTopSites();
+  Assert.equal(links.length, 2, "we now accept both http:// and https:// for top sites");
 });
 
 add_task(async function getTopFrecentSites_order() {
   await setUpActivityStreamTest();
 
   let provider = NewTabUtils.activityStreamLinks;
   let {TRANSITION_TYPED} = PlacesUtils.history;