Bug 1265866 - update Readability to the latest version from github, rs=me
author Gijs Kruitbosch <gijskruitbosch@gmail.com>
Wed, 20 Apr 2016 21:19:07 +0100
changeset 317869 387167675e152537dd5538a61d07c155a4fabdd0
parent 317868 6603fcdf9d73ffe1e33617015b879a5555b18ca7
child 317870 9ad67a907cae9644493ecbabae4529602ca3fa72
push id 9480
push user jlund@mozilla.com
push date Mon, 25 Apr 2016 17:12:58 +0000
treeherder mozilla-aurora@0d6a91c76a9e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers me
bugs 1265866
milestone 48.0a1
Bug 1265866 - update Readability to the latest version from github, rs=me
MozReview-Commit-ID: 2n8zVKjTfma
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -1,8 +1,9 @@
+/*eslint-env es6:false*/
 /*
  * DO NOT MODIFY THIS FILE DIRECTLY!
  *
  * This is a shared library that is maintained in an external repo:
  * https://github.com/mozilla/readability
  */
 
 /* This Source Code Form is subject to the terms of the Mozilla Public
@@ -693,22 +694,23 @@
       var arr = [];
       getHTML(this);
       return arr.join("");
     },
 
     set innerHTML(html) {
       var parser = new JSDOMParser();
       var node = parser.parse(html);
-      for (let i = this.childNodes.length; --i >= 0;) {
+      var i;
+      for (i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
       }
       this.childNodes = node.childNodes;
       this.children = node.children;
-      for (let i = this.childNodes.length; --i >= 0;) {
+      for (i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = this;
       }
     },
 
     set textContent(text) {
       // clear parentNodes for existing children
       for (var i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
@@ -1083,26 +1085,26 @@
       var c = this.nextChar();
 
       if (c === undefined)
         return null;
 
       // Read any text as Text node
       if (c !== "<") {
         --this.currentChar;
-        let node = new Text();
+        var textNode = new Text();
         var n = this.html.indexOf("<", this.currentChar);
         if (n === -1) {
-          node.innerHTML = this.html.substring(this.currentChar, this.html.length);
+          textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
           this.currentChar = this.html.length;
         } else {
-          node.innerHTML = this.html.substring(this.currentChar, n);
+          textNode.innerHTML = this.html.substring(this.currentChar, n);
           this.currentChar = n;
         }
-        return node;
+        return textNode;
       }
 
       c = this.peekNext();
 
       // Read Comment node. Normally, Comment nodes know their inner
       // textContent, but we don't really care about Comment nodes (we throw
       // them away in readChildren()). So just returning an empty Comment node
       // here is sufficient.
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -1,8 +1,9 @@
+/*eslint-env es6:false*/
 /*
  * DO NOT MODIFY THIS FILE DIRECTLY!
  *
  * This is a shared library that is maintained in an external repo:
  * https://github.com/mozilla/readability
  */
 
 /*
@@ -59,32 +60,33 @@ var Readability = function(uri, doc, opt
 
   // A list of the ETag headers of pages we've parsed, in case they happen to match,
   // we'll know it's a duplicate.
   this._pageETags = {};
 
   // Make an AJAX request for each page and append it to the document.
   this._curPageNum = 1;
 
+  var logEl;
+
   // Control whether log messages are sent to the console
   if (this._debug) {
-    function logEl(e) {
+    logEl = function(e) {
       var rv = e.nodeName + " ";
       if (e.nodeType == e.TEXT_NODE) {
         return rv + '("' + e.textContent + '")';
       }
       var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
       var elDesc = "";
-      if (e.id) {
+      if (e.id)
         elDesc = "(#" + e.id + classDesc + ")";
-      } else if (classDesc) {
+      else if (classDesc)
         elDesc = "(" + classDesc + ")";
-      }
       return rv + elDesc;
-    }
+    };
     this.log = function () {
       if ("dump" in root) {
         var msg = Array.prototype.map.call(arguments, function(x) {
           return (x && x.nodeName) ? logEl(x) : x;
         }).join(" ");
         dump("Reader: (Readability) " + msg + "\n");
       } else if ("console" in root) {
         var args = ["Reader: (Readability) "].concat(arguments);
@@ -113,20 +115,20 @@ Readability.prototype = {
   DEFAULT_MAX_PAGES: 5,
 
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
@@ -193,22 +195,24 @@ Readability.prototype = {
     return Array.prototype.concat.apply([], nodeLists);
   },
 
   _getAllNodesWithTag: function(node, tagNames) {
     if (node.querySelectorAll) {
       return node.querySelectorAll(tagNames.join(','));
     }
     return [].concat.apply([], tagNames.map(function(tag) {
-      return node.getElementsByTagName(tag);
+      var collection = node.getElementsByTagName(tag);
+      return Array.isArray(collection) ? collection : Array.from(collection);
     }));
   },
 
   /**
-   * Converts each <a> and <img> uri in the given element to an absolute URI.
+   * Converts each <a> and <img> uri in the given element to an absolute URI,
+   * ignoring #ref URIs.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
     var prePath = this._uri.prePath;
     var pathBase = this._uri.pathBase;
@@ -225,16 +229,20 @@ Readability.prototype = {
       // Prepath-rooted relative URI.
       if (uri[0] == "/")
         return prePath + uri;
 
       // Dotslash relative URI.
       if (uri.indexOf("./") === 0)
         return pathBase + uri.slice(2);
 
+      // Ignore hash URIs:
+      if (uri[0] == "#")
+        return uri;
+
       // Standard relative URI; add entire path. pathBase already includes a
       // trailing "/".
       return pathBase + uri;
     }
 
     var links = articleContent.getElementsByTagName("a");
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
@@ -369,19 +377,19 @@ Readability.prototype = {
       // <p> block.
       var replaced = false;
 
       // If we find a <br> chain, remove the <br>s until we hit another element
       // or non-whitespace. This leaves behind the first <br> in the chain
       // (which will be replaced with a <p> later).
       while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
         replaced = true;
-        let sibling = next.nextSibling;
+        var brSibling = next.nextSibling;
         next.parentNode.removeChild(next);
-        next = sibling;
+        next = brSibling;
       }
 
       // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
       // all sibling nodes as children of the <p> until we hit another <br>
       // chain.
       if (replaced) {
         var p = this._doc.createElement("p");
         br.parentNode.replaceChild(p, br);
@@ -391,17 +399,17 @@ Readability.prototype = {
           // If we've hit another <br><br>, we're done adding children to this <p>.
           if (next.tagName == "BR") {
             var nextElem = this._nextElement(next);
             if (nextElem && nextElem.tagName == "BR")
               break;
           }
 
           // Otherwise, make this node a child of the new <p>.
-          let sibling = next.nextSibling;
+          var sibling = next.nextSibling;
           p.appendChild(next);
           next = sibling;
         }
       }
     });
   },
 
   _setNodeTag: function (node, tag) {
@@ -742,17 +750,22 @@ Readability.prototype = {
             this._initializeNode(ancestor);
             candidates.push(ancestor);
           }
 
           // Node score divider:
           // - parent:             1 (no division)
           // - grandparent:        2
           // - great grandparent+: ancestor level * 3
-          var scoreDivider = level < 2 ? level + 1 : level * 3;
+          if (level === 0)
+            var scoreDivider = 1;
+          else if (level === 1)
+            scoreDivider = 2;
+          else
+            scoreDivider = level * 3;
           ancestor.readability.contentScore += contentScore / scoreDivider;
         });
       });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
@@ -855,17 +868,18 @@ Readability.prototype = {
             append = true;
           } else if (sibling.nodeName === "P") {
             var linkDensity = this._getLinkDensity(sibling);
             var nodeContent = this._getInnerText(sibling);
             var nodeLength = nodeContent.length;
 
             if (nodeLength > 80 && linkDensity < 0.25) {
               append = true;
-            } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
+            } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
+                       nodeContent.search(/\.( |$)/) !== -1) {
               append = true;
             }
           }
         }
 
         if (append) {
           this.log("Appending node:", sibling);
 
@@ -1140,17 +1154,17 @@ Readability.prototype = {
    * This is the amount of text that is inside a link divided by the total text in the node.
    *
    * @param Element
    * @return number (float)
   **/
   _getLinkDensity: function(element) {
     var textLength = this._getInnerText(element).length;
     if (textLength === 0)
-      return undefined;
+      return 0;
 
     var linkLength = 0;
 
     // XXX implement _reduceNodeList?
     this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
       linkLength += this._getInnerText(linkNode).length;
     });