Bug 1158228 - merge github's readability code into m-c, rs=me
☠☠ backed out by fd9158e97d81 ☠ ☠
author Margaret Leibovic <margaret.leibovic@gmail.com>
Fri, 01 May 2015 12:20:00 -0700
changeset 273496 c7440c022f74cc075ae187abe1d3f00eb343b568
parent 273495 df9523258590eb2fd995cc4dfe5192ced87f9478
child 273497 db3c12a86b66cd04fd701b20f6208f955f2bf3f1
push id 863
push user raliiev@mozilla.com
push date Mon, 03 Aug 2015 13:22:43 +0000
treeherder mozilla-release@f6321b14228d [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers me
bugs 1158228
milestone 40.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1158228 - merge github's readability code into m-c, rs=me
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -103,28 +103,31 @@ Readability.prototype = {
   // The number of top candidates to consider when analysing how
   // tight the competition is among candidates.
   DEFAULT_N_TOP_CANDIDATES: 5,
 
   // The maximum number of pages to loop through before we call
   // it quits and just show a link.
   DEFAULT_MAX_PAGES: 5,
 
+  // Element tags to score by default.
+  DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
+
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
     unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
 
@@ -181,16 +184,25 @@ Readability.prototype = {
     var slice = Array.prototype.slice;
     var args = slice.call(arguments);
     var nodeLists = args.map(function(list) {
       return slice.call(list);
     });
     return Array.prototype.concat.apply([], nodeLists);
   },
 
+  _getAllNodesWithTag: function(node, tagNames) {
+    if (node.querySelectorAll) {
+      return node.querySelectorAll(tagNames.join(','));
+    }
+    return [].concat.apply([], tagNames.map(function(tag) {
+      return node.getElementsByTagName(tag);
+    }));
+  },
+
   /**
    * Converts each <a> and <img> uri in the given element to an absolute URI.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
@@ -581,16 +593,28 @@ Readability.prototype = {
     if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
       this._articleByline = node.textContent.trim();
       return true;
     }
 
     return false;
   },
 
+  _getNodeAncestors: function(node, maxDepth) {
+    maxDepth = maxDepth || 0;
+    var i = 0, ancestors = [];
+    while (node.parentNode) {
+      ancestors.push(node.parentNode)
+      if (maxDepth && ++i === maxDepth)
+        break;
+      node = node.parentNode;
+    }
+    return ancestors;
+  },
+
   /***
    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
    *
    * @param page a document to run upon. Needs to be a full document, complete with body.
    * @return Element
   **/
   _grabArticle: function (page) {
@@ -635,18 +659,19 @@ Readability.prototype = {
               node.tagName !== "BODY" &&
               node.tagName !== "A") {
             this.log("Removing unlikely candidate - " + matchString);
             node = this._removeAndGetNext(node);
             continue;
           }
         }
 
-        if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
+        if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
           elementsToScore.push(node);
+        }
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
           // Sites like http://mobile.slate.com encloses each paragraph with a DIV
           // element. DIVs with only a P element inside and no text content can be
           // safely converted into plain P elements to avoid confusing the scoring
           // algorithm with DIVs with are, in practice, paragraphs.
           if (this._hasSinglePInsideElement(node)) {
@@ -675,57 +700,52 @@ Readability.prototype = {
       /**
        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
        * Then add their score to their parent node.
        *
        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
       var candidates = [];
       this._forEachNode(elementsToScore, function(elementToScore) {
-        var parentNode = elementToScore.parentNode;
-        var grandParentNode = parentNode ? parentNode.parentNode : null;
-        var innerText = this._getInnerText(elementToScore);
-
-        if (!parentNode || typeof(parentNode.tagName) === 'undefined')
+        if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
           return;
 
         // If this paragraph is less than 25 characters, don't even count it.
+        var innerText = this._getInnerText(elementToScore);
         if (innerText.length < 25)
           return;
 
-        // Initialize readability data for the parent.
-        if (typeof parentNode.readability === 'undefined') {
-          this._initializeNode(parentNode);
-          candidates.push(parentNode);
-        }
-
-        // Initialize readability data for the grandparent.
-        if (grandParentNode &&
-          typeof(grandParentNode.readability) === 'undefined' &&
-          typeof(grandParentNode.tagName) !== 'undefined') {
-          this._initializeNode(grandParentNode);
-          candidates.push(grandParentNode);
-        }
+        // Exclude nodes with no ancestor.
+        var ancestors = this._getNodeAncestors(elementToScore, 3);
+        if (ancestors.length === 0)
+          return;
 
         var contentScore = 0;
 
         // Add a point for the paragraph itself as a base.
         contentScore += 1;
 
         // Add points for any commas within this paragraph.
         contentScore += innerText.split(',').length;
 
         // For every 100 characters in this paragraph, add another point. Up to 3 points.
         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
 
-        // Add the score to the parent. The grandparent gets half.
-        parentNode.readability.contentScore += contentScore;
+        // Initialize and score ancestors.
+        this._forEachNode(ancestors, function(ancestor, level) {
+          if (!ancestor.tagName)
+            return;
 
-        if (grandParentNode)
-          grandParentNode.readability.contentScore += contentScore / 2;
+          if (typeof(ancestor.readability) === 'undefined') {
+            this._initializeNode(ancestor);
+            candidates.push(ancestor);
+          }
+
+          ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
+        });
       });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
         var candidate = candidates[c];
 
@@ -843,20 +863,16 @@ Readability.prototype = {
           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
             // We have a node that isn't a common block level element, like a form or td tag.
             // Turn it into a div so it doesn't get filtered out later by accident.
             this.log("Altering sibling:", sibling, 'to div.');
 
             sibling = this._setNodeTag(sibling, "DIV");
           }
 
-          // To ensure a node does not interfere with readability styles,
-          // remove its classnames.
-          sibling.removeAttribute("class");
-
           articleContent.appendChild(sibling);
           // siblings is a reference to the children array, and
           // sibling is removed from the array when we call appendChild().
           // As a result, we must revisit this index since the nodes
           // have been shifted.
           s -= 1;
           sl -= 1;
         }
@@ -948,17 +964,17 @@ Readability.prototype = {
     // Match Facebook's Open Graph title & description properties.
     var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
 
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
 
-      if (elementName === "author") {
+      if ([elementName, elementProperty].indexOf("author") !== -1) {
         metadata.byline = element.getAttribute("content");
         return;
       }
 
       var name = null;
       if (namePattern.test(elementName)) {
         name = elementName;
       } else if (propertyPattern.test(elementProperty)) {
@@ -1592,16 +1608,17 @@ Readability.prototype = {
    * @return void
    **/
   _cleanConditionally: function(e, tag) {
     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
       return;
 
     var tagsList = e.getElementsByTagName(tag);
     var curTagsLength = tagsList.length;
+    var isList = tag === "ul" || tag === "ol";
 
     // Gather counts for other typical elements embedded within.
     // Traverse backwards so we can remove nodes at the same time
     // without effecting the traversal.
     //
     // TODO: Consider taking into account original contentScore here.
     for (var i = curTagsLength-1; i >= 0; i -= 1) {
       var weight = this._getClassWeight(tagsList[i]);
@@ -1627,23 +1644,23 @@ Readability.prototype = {
             embedCount += 1;
         }
 
         var linkDensity = this._getLinkDensity(tagsList[i]);
         var contentLength = this._getInnerText(tagsList[i]).length;
         var toRemove = false;
         if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
           toRemove = true;
-        } else if (li > p && tag !== "ul" && tag !== "ol") {
+        } else if (!isList && li > p) {
           toRemove = true;
-        } else if ( input > Math.floor(p/3) ) {
+        } else if (input > Math.floor(p/3)) {
           toRemove = true;
-        } else if (contentLength < 25 && (img === 0 || img > 2) ) {
+        } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
           toRemove = true;
-        } else if (weight < 25 && linkDensity > 0.2) {
+        } else if (!isList && weight < 25 && linkDensity > 0.2) {
           toRemove = true;
         } else if (weight >= 25 && linkDensity > 0.5) {
           toRemove = true;
         } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
           toRemove = true;
         }
 
         if (toRemove) {
@@ -1658,17 +1675,17 @@ Readability.prototype = {
    *
    * @param Element
    * @return void
   **/
   _cleanHeaders: function(e) {
     for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
       var headers = e.getElementsByTagName('h' + headerIndex);
       for (var i = headers.length - 1; i >= 0; i -= 1) {
-        if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
+        if (this._getClassWeight(headers[i]) < 0)
           headers[i].parentNode.removeChild(headers[i]);
       }
     }
   },
 
   _flagIsActive: function(flag) {
     return (this._flags & flag) > 0;
   },
@@ -1681,42 +1698,52 @@ Readability.prototype = {
     this._flags = this._flags & ~flag;
   },
 
   /**
    * Decides whether or not the document is reader-able without parsing the whole thing.
    *
    * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
    */
-  isProbablyReaderable: function() {
-    var nodes = this._doc.getElementsByTagName("p");
-    if (nodes.length < 5) {
-      return false;
-    }
+  isProbablyReaderable: function(helperIsVisible) {
+    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
+
+    // FIXME we should have a fallback for helperIsVisible, but this is
+    // problematic because of jsdom's elem.style handling - see
+    // https://github.com/mozilla/readability/pull/186 for context.
 
-    var possibleParagraphs = 0;
-    for (var i = 0; i < nodes.length; i++) {
-      var node = nodes[i];
+    var score = 0;
+    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
+    // this callback:
+    return this._someNode(nodes, function(node) {
+      if (helperIsVisible && !helperIsVisible(node))
+        return false;
       var matchString = node.className + " " + node.id;
 
       if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
           !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
-        continue;
+        return false;
       }
 
-      if (node.textContent.trim().length < 100) {
-        continue;
+      if (node.matches && node.matches("li p")) {
+        return false;
       }
 
-      possibleParagraphs++;
-      if (possibleParagraphs >= 5) {
+      var textContentLength = node.textContent.trim().length;
+      if (textContentLength < 140) {
+        return false;
+      }
+
+      score += Math.sqrt(textContentLength - 140);
+
+      if (score > 20) {
         return true;
       }
-    }
-    return false;
+      return false;
+    });
   },
 
   /**
    * Runs readability.
    *
    * Workflow:
    *  1. Prep the document by removing script tags, css, etc.
    *  2. Build readability's DOM tree.