Bug 1265866 - update Readability to the latest version from github, rs=me
authorGijs Kruitbosch <gijskruitbosch@gmail.com>
Wed, 20 Apr 2016 21:19:07 +0100
changeset 331966 387167675e152537dd5538a61d07c155a4fabdd0
parent 331965 6603fcdf9d73ffe1e33617015b879a5555b18ca7
child 331967 9ad67a907cae9644493ecbabae4529602ca3fa72
push id6048
push userkmoir@mozilla.com
push dateMon, 06 Jun 2016 19:02:08 +0000
treeherdermozilla-beta@46d72a56c57d [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersme
bugs1265866
milestone48.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1265866 - update Readability to the latest version from github, rs=me MozReview-Commit-ID: 2n8zVKjTfma
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -1,8 +1,9 @@
+/*eslint-env es6:false*/
 /*
  * DO NOT MODIFY THIS FILE DIRECTLY!
  *
  * This is a shared library that is maintained in an external repo:
  * https://github.com/mozilla/readability
  */
 
 /* This Source Code Form is subject to the terms of the Mozilla Public
@@ -693,22 +694,23 @@
       var arr = [];
       getHTML(this);
       return arr.join("");
     },
 
     set innerHTML(html) {
       var parser = new JSDOMParser();
       var node = parser.parse(html);
-      for (let i = this.childNodes.length; --i >= 0;) {
+      var i;
+      for (i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
       }
       this.childNodes = node.childNodes;
       this.children = node.children;
-      for (let i = this.childNodes.length; --i >= 0;) {
+      for (i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = this;
       }
     },
 
     set textContent(text) {
       // clear parentNodes for existing children
       for (var i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
@@ -1083,26 +1085,26 @@
       var c = this.nextChar();
 
       if (c === undefined)
         return null;
 
       // Read any text as Text node
       if (c !== "<") {
         --this.currentChar;
-        let node = new Text();
+        var textNode = new Text();
         var n = this.html.indexOf("<", this.currentChar);
         if (n === -1) {
-          node.innerHTML = this.html.substring(this.currentChar, this.html.length);
+          textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
           this.currentChar = this.html.length;
         } else {
-          node.innerHTML = this.html.substring(this.currentChar, n);
+          textNode.innerHTML = this.html.substring(this.currentChar, n);
           this.currentChar = n;
         }
-        return node;
+        return textNode;
       }
 
       c = this.peekNext();
 
       // Read Comment node. Normally, Comment nodes know their inner
       // textContent, but we don't really care about Comment nodes (we throw
       // them away in readChildren()). So just returning an empty Comment node
       // here is sufficient.
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -1,8 +1,9 @@
+/*eslint-env es6:false*/
 /*
  * DO NOT MODIFY THIS FILE DIRECTLY!
  *
  * This is a shared library that is maintained in an external repo:
  * https://github.com/mozilla/readability
  */
 
 /*
@@ -59,32 +60,33 @@ var Readability = function(uri, doc, opt
 
   // A list of the ETag headers of pages we've parsed, in case they happen to match,
   // we'll know it's a duplicate.
   this._pageETags = {};
 
   // Make an AJAX request for each page and append it to the document.
   this._curPageNum = 1;
 
+  var logEl;
+
   // Control whether log messages are sent to the console
   if (this._debug) {
-    function logEl(e) {
+    logEl = function(e) {
       var rv = e.nodeName + " ";
       if (e.nodeType == e.TEXT_NODE) {
         return rv + '("' + e.textContent + '")';
       }
       var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
       var elDesc = "";
-      if (e.id) {
+      if (e.id)
         elDesc = "(#" + e.id + classDesc + ")";
-      } else if (classDesc) {
+      else if (classDesc)
         elDesc = "(" + classDesc + ")";
-      }
       return rv + elDesc;
-    }
+    };
     this.log = function () {
       if ("dump" in root) {
         var msg = Array.prototype.map.call(arguments, function(x) {
           return (x && x.nodeName) ? logEl(x) : x;
         }).join(" ");
         dump("Reader: (Readability) " + msg + "\n");
       } else if ("console" in root) {
         var args = ["Reader: (Readability) "].concat(arguments);
@@ -113,20 +115,20 @@ Readability.prototype = {
   DEFAULT_MAX_PAGES: 5,
 
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
@@ -193,22 +195,24 @@ Readability.prototype = {
     return Array.prototype.concat.apply([], nodeLists);
   },
 
   _getAllNodesWithTag: function(node, tagNames) {
     if (node.querySelectorAll) {
       return node.querySelectorAll(tagNames.join(','));
     }
     return [].concat.apply([], tagNames.map(function(tag) {
-      return node.getElementsByTagName(tag);
+      var collection = node.getElementsByTagName(tag);
+      return Array.isArray(collection) ? collection : Array.from(collection);
     }));
   },
 
   /**
-   * Converts each <a> and <img> uri in the given element to an absolute URI.
+   * Converts each <a> and <img> uri in the given element to an absolute URI,
+   * ignoring #ref URIs.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
     var prePath = this._uri.prePath;
     var pathBase = this._uri.pathBase;
@@ -225,16 +229,20 @@ Readability.prototype = {
       // Prepath-rooted relative URI.
       if (uri[0] == "/")
         return prePath + uri;
 
       // Dotslash relative URI.
       if (uri.indexOf("./") === 0)
         return pathBase + uri.slice(2);
 
+      // Ignore hash URIs:
+      if (uri[0] == "#")
+        return uri;
+
       // Standard relative URI; add entire path. pathBase already includes a
       // trailing "/".
       return pathBase + uri;
     }
 
     var links = articleContent.getElementsByTagName("a");
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
@@ -369,19 +377,19 @@ Readability.prototype = {
       // <p> block.
       var replaced = false;
 
       // If we find a <br> chain, remove the <br>s until we hit another element
       // or non-whitespace. This leaves behind the first <br> in the chain
       // (which will be replaced with a <p> later).
       while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
         replaced = true;
-        let sibling = next.nextSibling;
+        var brSibling = next.nextSibling;
         next.parentNode.removeChild(next);
-        next = sibling;
+        next = brSibling;
       }
 
       // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
       // all sibling nodes as children of the <p> until we hit another <br>
       // chain.
       if (replaced) {
         var p = this._doc.createElement("p");
         br.parentNode.replaceChild(p, br);
@@ -391,17 +399,17 @@ Readability.prototype = {
           // If we've hit another <br><br>, we're done adding children to this <p>.
           if (next.tagName == "BR") {
             var nextElem = this._nextElement(next);
             if (nextElem && nextElem.tagName == "BR")
               break;
           }
 
           // Otherwise, make this node a child of the new <p>.
-          let sibling = next.nextSibling;
+          var sibling = next.nextSibling;
           p.appendChild(next);
           next = sibling;
         }
       }
     });
   },
 
   _setNodeTag: function (node, tag) {
@@ -742,17 +750,22 @@ Readability.prototype = {
             this._initializeNode(ancestor);
             candidates.push(ancestor);
           }
 
           // Node score divider:
           // - parent:             1 (no division)
           // - grandparent:        2
           // - great grandparent+: ancestor level * 3
-          var scoreDivider = level < 2 ? level + 1 : level * 3;
+          if (level === 0)
+            var scoreDivider = 1;
+          else if (level === 1)
+            scoreDivider = 2;
+          else
+            scoreDivider = level * 3;
           ancestor.readability.contentScore += contentScore / scoreDivider;
         });
       });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
@@ -855,17 +868,18 @@ Readability.prototype = {
             append = true;
           } else if (sibling.nodeName === "P") {
             var linkDensity = this._getLinkDensity(sibling);
             var nodeContent = this._getInnerText(sibling);
             var nodeLength = nodeContent.length;
 
             if (nodeLength > 80 && linkDensity < 0.25) {
               append = true;
-            } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
+            } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
+                       nodeContent.search(/\.( |$)/) !== -1) {
               append = true;
             }
           }
         }
 
         if (append) {
           this.log("Appending node:", sibling);
 
@@ -1140,17 +1154,17 @@ Readability.prototype = {
    * This is the amount of text that is inside a link divided by the total text in the node.
    *
    * @param Element
    * @return number (float)
   **/
   _getLinkDensity: function(element) {
     var textLength = this._getInnerText(element).length;
     if (textLength === 0)
-      return undefined;
+      return 0;
 
     var linkLength = 0;
 
     // XXX implement _reduceNodeList?
     this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
       linkLength += this._getInnerText(linkNode).length;
     });