Bug 782285 - Don't offer reader in pages with too much reading competition (r=mfinkle)
author Lucas Rocha <lucasr@mozilla.com>
Thu, 23 Aug 2012 17:08:13 +0100
changeset 105214 55c4a3f3a6a96597f7b9ce3d698071f3820e4abb
parent 105213 236151ae351f82468be10b7b104d20f001efb49c
child 105215 08b790ce10cdd77757ea5d772afb7ea22dd8f957
push id 55
push user shu@rfrn.org
push date Thu, 30 Aug 2012 01:33:09 +0000
reviewers mfinkle
bugs 782285
milestone 17.0a1
Bug 782285 - Don't offer reader in pages with too much reading competition (r=mfinkle) * * * blo
mobile/android/chrome/content/Readability.js
--- a/mobile/android/chrome/content/Readability.js
+++ b/mobile/android/chrome/content/Readability.js
@@ -53,16 +53,20 @@ var Readability = function(uri, doc) {
   }
 }
 
 Readability.prototype = {
   FLAG_STRIP_UNLIKELYS: 0x1,
   FLAG_WEIGHT_CLASSES: 0x2,
   FLAG_CLEAN_CONDITIONALLY: 0x4,
 
+  // The number of top candidates to consider when analysing how
+  // tight the competition is among candidates.
+  N_TOP_CANDIDATES: 5,
+
   // The maximum number of pages to loop through before we call
   // it quits and just show a link.
   MAX_PAGES: 5,
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
     unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
@@ -530,34 +534,44 @@ Readability.prototype = {
         parentNode.readability.contentScore += contentScore;
 
         if (grandParentNode)
           grandParentNode.readability.contentScore += contentScore / 2;
       }
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
-      let topCandidate = null;
+      let topCandidates = [];
       for (let c = 0, cl = candidates.length; c < cl; c += 1) {
+        let candidate = candidates[c];
+
         // Scale the final candidates score based on link density. Good content
         // should have a relatively small link density (5% or less) and be mostly
         // unaffected by this operation.
-        candidates[c].readability.contentScore =
-            candidates[c].readability.contentScore * (1 - this._getLinkDensity(candidates[c]));
+        let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
+        candidate.readability.contentScore = candidateScore;
+
+        this.log('Candidate: ' + candidate + " (" + candidate.className + ":" +
+          candidate.id + ") with score " + candidateScore);
 
-        this.log('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" +
-          candidates[c].id + ") with score " +
-          candidates[c].readability.contentScore);
+        for (let t = 0; t < this.N_TOP_CANDIDATES; t++) {
+          let aTopCandidate = topCandidates[t];
 
-        if (!topCandidate ||
-          candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
-          topCandidate = candidates[c];
+          if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
+            topCandidates.splice(t, 0, candidate);
+            if (topCandidates.length > this.N_TOP_CANDIDATES)
+              topCandidates.pop();
+            break;
+          }
         }
       }
 
+      let topCandidate = topCandidates[0] || null;
+      let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null);
+
       // If we still have no top candidate, just use the body as a last resort.
       // We also have to copy the body node so it is something we can modify.
       if (topCandidate === null || topCandidate.tagName === "BODY") {
         topCandidate = doc.createElement("DIV");
         topCandidate.innerHTML = page.innerHTML;
 
         page.innerHTML = "";
         page.appendChild(topCandidate);
@@ -642,29 +656,40 @@ Readability.prototype = {
       if (this._curPageNum === 1)
         articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
 
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, false).length < 250) {
+      if (this._getInnerText(articleContent, true).length < 500) {
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
         } else {
           return null;
         }
       } else {
+        if (lastTopCandidate !== null) {
+          // EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the
+          // readability algorithm. This is to avoid offering reader mode on pages that are more like
+          // a list or directory of links with summaries. It takes the score of the last top candidate
+          // (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not
+          // actual articles, there will likely be many candidates with similar score (i.e. higher contrast ratio).
+          let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore;
+          if (contrastRatio > 0.45)
+            return null;
+        }
+
         return articleContent;
       }
     }
   },
 
   /**
    * Removes script tags from the document.
    *