No bug: update readability libs to the up-to-date github versions. rs=me+Gijs
author: Margaret Leibovic <margaret.leibovic@gmail.com>
Mon, 23 Mar 2015 11:29:23 -0700
changeset 235336 c3681d12e524822bb2125fb8d801c154618c8ec1
parent 235175 9cee181014eb6302296b9b9f1552ab9019c9c32b
child 235337 466d2f2a7b7e51cca2ce024a23baeb8723c8ae76
push id: 57400
push user: ryanvm@gmail.com
push date: Tue, 24 Mar 2015 15:59:13 +0000
treeherder: mozilla-inbound@47fa87252df0 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: me
milestone: 39.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
No bug: update readability libs to the up-to-date github versions. rs=me+Gijs
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -98,17 +98,17 @@ Readability.prototype = {
     unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
 
@@ -121,16 +121,46 @@ Readability.prototype = {
    * @return void
   **/
   _postProcessContent: function(articleContent) {
     // Readability cannot open relative uris so we convert them to absolute uris.
     this._fixRelativeUris(articleContent);
   },
 
   /**
+   * Iterate over a NodeList, which doesn't natively fully implement the Array
+   * interface.
+   *
+   * For convenience, the current object context is applied to the provided
+   * iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return void
+   */
+  _forEachNode: function(nodeList, fn) {
+    return Array.prototype.forEach.call(nodeList, fn, this);
+  },
+
+  /**
+   * Iterate over a NodeList, return true if any of the provided iterate
+   * function calls returns true, false otherwise.
+   *
+   * For convenience, the current object context is applied to the
+   * provided iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return Boolean
+   */
+  _someNode: function(nodeList, fn) {
+    return Array.prototype.some.call(nodeList, fn, this);
+  },
+
+  /**
    * Converts each <a> and <img> uri in the given element to an absolute URI.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
     var prePath = this._uri.prePath;
@@ -144,36 +174,39 @@ Readability.prototype = {
       // Scheme-rooted relative URI.
       if (uri.substr(0, 2) == "//")
         return scheme + "://" + uri.substr(2);
 
       // Prepath-rooted relative URI.
       if (uri[0] == "/")
         return prePath + uri;
 
+      // Dotslash relative URI.
+      if (uri.indexOf("./") === 0)
+        return pathBase + uri.slice(2);
+
       // Standard relative URI; add entire path. pathBase already includes a
       // trailing "/".
       return pathBase + uri;
     }
 
     function convertRelativeURIs(tagName, propName) {
       var elems = articleContent.getElementsByTagName(tagName);
-      for (var i = elems.length; --i >= 0;) {
-        var elem = elems[i];
+      this._forEachNode(elems, function(elem) {
         var relativeURI = elem.getAttribute(propName);
         if (relativeURI != null)
-          elems[i].setAttribute(propName, toAbsoluteURI(relativeURI));
-      }
+          elem.setAttribute(propName, toAbsoluteURI(relativeURI));
+      });
     }
 
      // Fix links.
-    convertRelativeURIs("a", "href");
+    convertRelativeURIs.call(this, "a", "href");
 
      // Fix images.
-    convertRelativeURIs("img", "src");
+    convertRelativeURIs.call(this, "img", "src");
   },
 
   /**
    * Get the article title as an H1.
    *
    * @return void
    **/
   _getArticleTitle: function() {
@@ -219,29 +252,27 @@ Readability.prototype = {
    * This includes things like stripping javascript, CSS, and handling terrible markup.
    *
    * @return void
    **/
   _prepDocument: function() {
     var doc = this._doc;
 
     // Remove all style tags in head
-    var styleTags = doc.getElementsByTagName("style");
-    for (var st = styleTags.length - 1; st >= 0; st -= 1) {
-      styleTags[st].parentNode.removeChild(styleTags[st]);
-    }
+    this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
+      styleNode.parentNode.removeChild(styleNode);
+    });
 
     if (doc.body) {
       this._replaceBrs(doc.body);
     }
 
-    var fonts = doc.getElementsByTagName("FONT");
-    for (var i = fonts.length; --i >=0;) {
-      this._setNodeTag(fonts[i], "SPAN");
-    }
+    this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
+      this._setNodeTag(fontNode, "SPAN");
+    });
   },
 
   /**
    * Finds the next element, starting from the given node, and ignoring
    * whitespace in between. If the given node is an element, the same node is
    * returned.
    */
   _nextElement: function (node) {
@@ -257,19 +288,17 @@ Readability.prototype = {
   /**
    * Replaces 2 or more successive <br> elements with a single <p>.
    * Whitespace between <br> elements are ignored. For example:
    *   <div>foo<br>bar<br> <br><br>abc</div>
    * will become:
    *   <div>foo<br>bar<p>abc</p></div>
    */
   _replaceBrs: function (elem) {
-    var brs = elem.getElementsByTagName("br");
-    for (var i = 0; i < brs.length; i++) {
-      var br = brs[i];
+    this._forEachNode(elem.getElementsByTagName("br"), function(br) {
       var next = br.nextSibling;
 
       // Whether 2 or more <br> elements have been found and replaced with a
       // <p> block.
       var replaced = false;
 
       // If we find a <br> chain, remove the <br>s until we hit another element
       // or non-whitespace. This leaves behind the first <br> in the chain
@@ -298,17 +327,17 @@ Readability.prototype = {
           }
 
           // Otherwise, make this node a child of the new <p>.
           var sibling = next.nextSibling;
           p.appendChild(next);
           next = sibling;
         }
       }
-    }
+    });
   },
 
   _setNodeTag: function (node, tag) {
     // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
     // won't actually be set).
     node.localName = tag.toLowerCase();
     node.tagName = tag.toUpperCase();
   },
@@ -321,16 +350,17 @@ Readability.prototype = {
    * @return void
    **/
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._clean(articleContent, "object");
+    this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
 
     // If there is only one h2, they are probably using it as a header
     // and not a subheader, so remove it since we already have a header.
     if (articleContent.getElementsByTagName('h2').length === 1)
       this._clean(articleContent, "h2");
 
     this._clean(articleContent, "iframe");
@@ -338,36 +368,33 @@ Readability.prototype = {
 
     // Do these last as the previous stuff may have removed junk
     // that will affect these
     this._cleanConditionally(articleContent, "table");
     this._cleanConditionally(articleContent, "ul");
     this._cleanConditionally(articleContent, "div");
 
     // Remove extra paragraphs
-    var articleParagraphs = articleContent.getElementsByTagName('p');
-    for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) {
-      var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
-      var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
-      var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
+    this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
+      var imgCount = paragraph.getElementsByTagName('img').length;
+      var embedCount = paragraph.getElementsByTagName('embed').length;
+      var objectCount = paragraph.getElementsByTagName('object').length;
+      // At this point, nasty iframes have been removed, only remain embedded video ones.
+      var iframeCount = paragraph.getElementsByTagName('iframe').length;
+      var totalCount = imgCount + embedCount + objectCount + iframeCount;
 
-      if (imgCount === 0 &&
-        embedCount === 0 &&
-        objectCount === 0 &&
-        this._getInnerText(articleParagraphs[i], false) === '')
-        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
-    }
+      if (totalCount === 0 && !this._getInnerText(paragraph, false))
+        paragraph.parentNode.removeChild(paragraph);
+    });
 
-    var brs = articleContent.getElementsByTagName("BR");
-    for (var i = brs.length; --i >= 0;) {
-      var br = brs[i];
+    this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
       var next = this._nextElement(br.nextSibling);
       if (next && next.tagName == "P")
         br.parentNode.removeChild(br);
-    }
+    });
   },
 
   /**
    * Initialize a node with the readability object. Also checks the
    * className/id for special names to add to its score.
    *
    * @param Element
    * @return void
@@ -524,49 +551,48 @@ Readability.prototype = {
             var newNode = node.firstElementChild;
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
           } else if (!this._hasChildBlockElement(node)) {
             this._setNodeTag(node, "P");
             elementsToScore.push(node);
           } else {
             // EXPERIMENTAL
-            for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
-              var childNode = node.childNodes[i];
+            this._forEachNode(node.childNodes, function(childNode) {
               if (childNode.nodeType === Node.TEXT_NODE) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
                 p.className = 'readability-styled';
                 node.replaceChild(p, childNode);
               }
-            }
+            });
           }
         }
         node = this._getNextNode(node);
       }
 
       /**
        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
        * Then add their score to their parent node.
        *
        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
       var candidates = [];
-      for (var pt = 0; pt < elementsToScore.length; pt += 1) {
-        var parentNode = elementsToScore[pt].parentNode;
+      this._forEachNode(elementsToScore, function(elementToScore) {
+        var parentNode = elementToScore.parentNode;
         var grandParentNode = parentNode ? parentNode.parentNode : null;
-        var innerText = this._getInnerText(elementsToScore[pt]);
+        var innerText = this._getInnerText(elementToScore);
 
         if (!parentNode || typeof(parentNode.tagName) === 'undefined')
-          continue;
+          return;
 
         // If this paragraph is less than 25 characters, don't even count it.
         if (innerText.length < 25)
-          continue;
+          return;
 
         // Initialize readability data for the parent.
         if (typeof parentNode.readability === 'undefined') {
           this._initializeNode(parentNode);
           candidates.push(parentNode);
         }
 
         // Initialize readability data for the grandparent.
@@ -588,17 +614,17 @@ Readability.prototype = {
         // For every 100 characters in this paragraph, add another point. Up to 3 points.
         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
 
         // Add the score to the parent. The grandparent gets half.
         parentNode.readability.contentScore += contentScore;
 
         if (grandParentNode)
           grandParentNode.readability.contentScore += contentScore / 2;
-      }
+      });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
         var candidate = candidates[c];
 
         // Scale the final candidates score based on link density. Good content
@@ -645,28 +671,29 @@ Readability.prototype = {
         // Because of our bonus system, parents of candidates might have scores
         // themselves. They get half of the node. There won't be nodes with higher
         // scores than our topCandidate, but if we see the score going *up* in the first
         // few steps up the tree, that's a decent sign that there might be more content
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
         var parentOfTopCandidate = topCandidate.parentNode;
+        var lastScore = topCandidate.readability.contentScore;
         // The scores shouldn't get too low.
-        var scoreThreshold = topCandidate.readability.contentScore / 3;
-        var lastScore = parentOfTopCandidate.readability.contentScore;
+        var scoreThreshold = lastScore / 3;
         while (parentOfTopCandidate && parentOfTopCandidate.readability) {
           var parentScore = parentOfTopCandidate.readability.contentScore;
           if (parentScore < scoreThreshold)
             break;
           if (parentScore > lastScore) {
             // Alright! We found a better parent to use.
             topCandidate = parentOfTopCandidate;
             break;
           }
+          lastScore = parentOfTopCandidate.readability.contentScore;
           parentOfTopCandidate = parentOfTopCandidate.parentNode;
         }
       }
 
       // Now that we have the top candidate, look through its siblings for content
       // that might also be related. Things like preambles, content split by ads
       // that we removed, etc.
       var articleContent = doc.createElement("DIV");
@@ -799,40 +826,39 @@ Readability.prototype = {
       byline = byline.trim();
       return (byline.length > 0) && (byline.length < 100);
     }
     return false;
   },
 
   /**
    * Attempts to get excerpt and byline metadata for the article.
-   * 
+   *
    * @return Object with optional "excerpt" and "byline" properties
    */
   _getArticleMetadata: function() {
     var metadata = {};
     var values = {};
     var metaElements = this._doc.getElementsByTagName("meta");
 
     // Match "description", or Twitter's "twitter:description" (Cards)
     // in name attribute.
     var namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi;
 
     // Match Facebook's og:description (Open Graph) in property attribute.
     var propertyPattern = /^\s*og\s*:\s*description\s*$/gi;
 
     // Find description tags.
-    for (var i = 0; i < metaElements.length; i++) {
-      var element = metaElements[i];
+    this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
 
       if (elementName === "author") {
         metadata.byline = element.getAttribute("content");
-        continue;
+        return;
       }
 
       var name = null;
       if (namePattern.test(elementName)) {
         name = elementName;
       } else if (propertyPattern.test(elementProperty)) {
         name = elementProperty;
       }
@@ -841,17 +867,17 @@ Readability.prototype = {
         var content = element.getAttribute("content");
         if (content) {
           // Convert to lowercase and remove any whitespace
           // so we can match below.
           name = name.toLowerCase().replace(/\s/g, '');
           values[name] = content.trim();
         }
       }
-    }
+    });
 
     if ("description" in values) {
       metadata.excerpt = values["description"];
     } else if ("og:description" in values) {
       // Use facebook open graph description.
       metadata.excerpt = values["og:description"];
     } else if ("twitter:description" in values) {
       // Use twitter cards description.
@@ -862,76 +888,68 @@ Readability.prototype = {
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
   _removeScripts: function(doc) {
-    var scripts = doc.getElementsByTagName('script');
-    for (var i = scripts.length - 1; i >= 0; i -= 1) {
-      scripts[i].nodeValue="";
-      scripts[i].removeAttribute('src');
+    this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
+      scriptNode.nodeValue = "";
+      scriptNode.removeAttribute('src');
 
-      if (scripts[i].parentNode)
-          scripts[i].parentNode.removeChild(scripts[i]);
-    }
+      if (scriptNode.parentNode)
+        scriptNode.parentNode.removeChild(scriptNode);
+    });
   },
 
   /**
    * Check if this node has only whitespace and a single P element
    * Returns false if the DIV node contains non-empty text nodes
    * or if it contains no P or more than 1 element.
    *
    * @param Element
   **/
-  _hasSinglePInsideElement: function(e) {
+  _hasSinglePInsideElement: function(element) {
     // There should be exactly 1 element child which is a P:
-    if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
+    if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
       return false;
     }
+
     // And there should be no text nodes with real content
-    var childNodes = e.childNodes;
-    for (var i = childNodes.length; --i >= 0;) {
-      var node = childNodes[i];
-      if (node.nodeType == Node.TEXT_NODE &&
-          this.REGEXPS.hasContent.test(node.textContent)) {
-        return false;
-      }
-    }
-
-    return true;
+    return !this._someNode(element.childNodes, function(node) {
+      return node.nodeType === Node.TEXT_NODE &&
+             this.REGEXPS.hasContent.test(node.textContent);
+    });
   },
 
   /**
    * Determine whether element has any children block level elements.
    *
    * @param Element
    */
-  _hasChildBlockElement: function (e) {
-    var length = e.children.length;
-    for (var i = 0; i < length; i++) {
-      var child = e.children[i];
-      if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
-        return true;
-    }
-    return false;
+  _hasChildBlockElement: function (element) {
+    return this._someNode(element.childNodes, function(node) {
+      return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
+             this._hasChildBlockElement(node);
+    });
   },
 
   /**
    * Get the inner text of a node - cross browser compatibly.
    * This also strips out any excess whitespace to be found.
    *
    * @param Element
+   * @param Boolean normalizeSpaces (default: true)
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
+    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
     var textContent = e.textContent.trim();
-    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
 
     if (normalizeSpaces) {
       return textContent.replace(this.REGEXPS.normalize, " ");
     } else {
       return textContent;
     }
   },
 
@@ -980,24 +998,27 @@ Readability.prototype = {
 
   /**
    * Get the density of links as a percentage of the content
    * This is the amount of text that is inside a link divided by the total text in the node.
    *
    * @param Element
    * @return number (float)
   **/
-  _getLinkDensity: function(e) {
-    var links = e.getElementsByTagName("a");
-    var textLength = this._getInnerText(e).length;
+  _getLinkDensity: function(element) {
+    var textLength = this._getInnerText(element).length;
+    if (textLength === 0)
+      return;
+
     var linkLength = 0;
 
-    for (var i = 0, il = links.length; i < il; i += 1) {
-      linkLength += this._getInnerText(links[i]).length;
-    }
+    // XXX implement _reduceNodeList?
+    this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
+      linkLength += this._getInnerText(linkNode).length;
+    });
 
     return linkLength / textLength;
   },
 
   /**
    * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
    *
    * @author Dan Lacy
@@ -1400,38 +1421,36 @@ Readability.prototype = {
    * Clean a node of all elements of type "tag".
    * (Unless it's a youtube/vimeo video. People love movies.)
    *
    * @param Element
    * @param string tag to clean
    * @return void
    **/
   _clean: function(e, tag) {
-    var targetList = e.getElementsByTagName(tag);
-    var isEmbed = (tag === 'object' || tag === 'embed');
+    var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
 
-    for (var y = targetList.length - 1; y >= 0; y -= 1) {
+    this._forEachNode(e.getElementsByTagName(tag), function(element) {
       // Allow youtube and vimeo videos through as people usually want to see those.
       if (isEmbed) {
-        var attributeValues = "";
-        for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
-          attributeValues += targetList[y].attributes[i].value + '|';
-        }
+        var attributeValues = [].map.call(element.attributes, function(attr) {
+          return attr.value;
+        }).join("|");
 
         // First, check the elements attributes to see if any of them contain youtube or vimeo
         if (this.REGEXPS.videos.test(attributeValues))
-          continue;
+          return;
 
         // Then check the elements inside this element for the same.
-        if (this.REGEXPS.videos.test(targetList[y].innerHTML))
-          continue;
+        if (this.REGEXPS.videos.test(element.innerHTML))
+          return;
       }
 
-      targetList[y].parentNode.removeChild(targetList[y]);
-    }
+      element.parentNode.removeChild(element);
+    });
   },
 
   /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
    * @return void
    **/
@@ -1573,17 +1592,17 @@ Readability.prototype = {
     // }
 
     // If we haven't found an excerpt in the article's metadata, use the article's
     // first paragraph as the excerpt. This is used for displaying a preview of
     // the article's content.
     if (!metadata.excerpt) {
       var paragraphs = articleContent.getElementsByTagName("p");
       if (paragraphs.length > 0) {
-        metadata.excerpt = paragraphs[0].textContent;
+        metadata.excerpt = paragraphs[0].textContent.trim();
       }
     }
 
     return { uri: this._uri,
              title: articleTitle,
              byline: metadata.byline || this._articleByline,
              dir: this._articleDir,
              content: articleContent.innerHTML,