Bug 1322327 et al., update readability from github, rs=me
authorGijs Kruitbosch <gijskruitbosch@gmail.com>
Wed, 18 Jan 2017 14:21:10 +0000
changeset 377233 2d9cff6d389ac6ece80e6621b2a6d029bcb81908
parent 377232 b3885db8150b4e0450717776f3a652ec2425503c
child 377234 c6e104aef85f2ee6831ce1e6b38c4127ef08d125
push id1419
push userjlund@mozilla.com
push dateMon, 10 Apr 2017 20:44:07 +0000
treeherdermozilla-release@5e6801b73ef6 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersme
bugs1322327
milestone53.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1322327 et al., update readability from github, rs=me MozReview-Commit-ID: Gg9o2gE6T3t
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -77,22 +77,22 @@ function Readability(uri, doc, options) 
       var elDesc = "";
       if (e.id)
         elDesc = "(#" + e.id + classDesc + ")";
       else if (classDesc)
         elDesc = "(" + classDesc + ")";
       return rv + elDesc;
     };
     this.log = function () {
-      if (typeof dump !== undefined) {
+      if (typeof dump !== "undefined") {
         var msg = Array.prototype.map.call(arguments, function(x) {
           return (x && x.nodeName) ? logEl(x) : x;
         }).join(" ");
         dump("Reader: (Readability) " + msg + "\n");
-      } else if (typeof console !== undefined) {
+      } else if (typeof console !== "undefined") {
         var args = ["Reader: (Readability) "].concat(arguments);
         console.log.apply(console, args);
       }
     };
   } else {
     this.log = function () {};
   }
 }
@@ -114,17 +114,17 @@ Readability.prototype = {
   DEFAULT_MAX_PAGES: 5,
 
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|rss|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+    unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby|p-author/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@@ -472,16 +472,17 @@ Readability.prototype = {
    * @param Element
    * @return void
    **/
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
+    this._cleanConditionally(articleContent, "fieldset");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
 
     // Clean out elements have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
     this._forEachNode(articleContent.children, function(topCandidate) {
@@ -489,16 +490,20 @@ Readability.prototype = {
     });
 
     // If there is only one h2, they are probably using it as a header
     // and not a subheader, so remove it since we already have a header.
     if (articleContent.getElementsByTagName('h2').length === 1)
       this._clean(articleContent, "h2");
 
     this._clean(articleContent, "iframe");
+    this._clean(articleContent, "input");
+    this._clean(articleContent, "textarea");
+    this._clean(articleContent, "select");
+    this._clean(articleContent, "button");
     this._cleanHeaders(articleContent);
 
     // Do these last as the previous stuff may have removed junk
     // that will affect these
     this._cleanConditionally(articleContent, "table");
     this._cleanConditionally(articleContent, "ul");
     this._cleanConditionally(articleContent, "div");
 
@@ -841,28 +846,59 @@ Readability.prototype = {
           this.log("Moving child out:", kids[0]);
           topCandidate.appendChild(kids[0]);
         }
 
         page.appendChild(topCandidate);
 
         this._initializeNode(topCandidate);
       } else if (topCandidate) {
+        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
+        // and whose scores are quite closed with current `topCandidate` node.
+        var alternativeCandidateAncestors = [];
+        for (var i = 1; i < topCandidates.length; i++) {
+          if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
+            alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
+          }
+        }
+        var MINIMUM_TOPCANDIDATES = 3;
+        if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
+          parentOfTopCandidate = topCandidate.parentNode;
+          while (parentOfTopCandidate.tagName !== "BODY") {
+            var listsContainingThisAncestor = 0;
+            for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
+              listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
+            }
+            if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
+              topCandidate = parentOfTopCandidate;
+              break;
+            }
+            parentOfTopCandidate = parentOfTopCandidate.parentNode;
+          }
+        }
+        if (!topCandidate.readability) {
+          this._initializeNode(topCandidate);
+        }
+
         // Because of our bonus system, parents of candidates might have scores
         // themselves. They get half of the node. There won't be nodes with higher
         // scores than our topCandidate, but if we see the score going *up* in the first
         // few steps up the tree, that's a decent sign that there might be more content
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
         parentOfTopCandidate = topCandidate.parentNode;
         var lastScore = topCandidate.readability.contentScore;
         // The scores shouldn't get too low.
         var scoreThreshold = lastScore / 3;
-        while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+        while (parentOfTopCandidate.tagName !== "BODY") {
+          if (!parentOfTopCandidate.readability) {
+            parentOfTopCandidate = parentOfTopCandidate.parentNode;
+            continue;
+          }
           var parentScore = parentOfTopCandidate.readability.contentScore;
           if (parentScore < scoreThreshold)
             break;
           if (parentScore > lastScore) {
             // Alright! We found a better parent to use.
             topCandidate = parentOfTopCandidate;
             break;
           }
@@ -1236,21 +1272,16 @@ Readability.prototype = {
       if (segment.indexOf(".") !== -1) {
         possibleType = segment.split(".")[1];
 
         // If the type isn't alpha-only, it's probably not actually a file extension.
         if (!possibleType.match(/[^a-zA-Z]/))
           segment = segment.split(".")[0];
       }
 
-      // EW-CMS specific segment replacement. Ugly.
-      // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
-      if (segment.indexOf(',00') !== -1)
-        segment = segment.replace(',00', '');
-
       // If our first or second segment has anything looking like a page number, remove it.
       if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
         segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
 
       var del = false;
 
       // If this is purely a number, and it's the first or second segment,
       // it's probably a page number. Remove it.
@@ -1708,21 +1739,20 @@ Readability.prototype = {
           if (!this.REGEXPS.videos.test(embeds[ei].src))
             embedCount += 1;
         }
 
         var linkDensity = this._getLinkDensity(node);
         var contentLength = this._getInnerText(node).length;
 
         var haveToRemove =
-          // Make an exception for elements with no p's and exactly 1 img.
-          (img > p && !this._hasAncestorTag(node, "figure")) ||
+          (img > 1 && img > p && !this._hasAncestorTag(node, "figure")) ||
           (!isList && li > p) ||
           (input > Math.floor(p/3)) ||
-          (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+          (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
           (!isList && weight < 25 && linkDensity > 0.2) ||
           (weight >= 25 && linkDensity > 0.5) ||
           ((embedCount === 1 && contentLength < 75) || embedCount > 1);
         return haveToRemove;
       }
       return false;
     });
   },
@@ -1907,8 +1937,12 @@ Readability.prototype = {
       dir: this._articleDir,
       content: articleContent.innerHTML,
       textContent: textContent,
       length: textContent.length,
       excerpt: metadata.excerpt,
     };
   }
 };
+
+if (typeof module === "object") {
+  module.exports = Readability;
+}