No bug: update readability libs to the up-to-date github versions. rs=me+Gijs a=readinglist
author Margaret Leibovic <margaret.leibovic@gmail.com>
Mon, 23 Mar 2015 11:29:23 -0700
changeset 258076 cb38b4973ea93df3807296d0bdee82beb1cc13f0
parent 258075 54db0a4c777fc2e2f2d33229d2b2754be1dbfb29
child 258077 dfd7c0ccfffda4afd76c57bcf5ab5546b1895340
push id 4610
push user jlund@mozilla.com
push date Mon, 30 Mar 2015 18:32:55 +0000
treeherder mozilla-beta@4df54044d9ef [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers me, readinglist
milestone 38.0a2
No bug: update readability libs to the up-to-date github versions. rs=me+Gijs a=readinglist
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -98,17 +98,17 @@ Readability.prototype = {
     unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
 
@@ -121,16 +121,46 @@ Readability.prototype = {
    * @return void
   **/
   _postProcessContent: function(articleContent) {
     // Readability cannot open relative uris so we convert them to absolute uris.
     this._fixRelativeUris(articleContent);
   },
 
   /**
+   * Iterate over a NodeList, which doesn't natively fully implement the Array
+   * interface.
+   *
+   * For convenience, the current object context is applied to the provided
+   * iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return void
+   */
+  _forEachNode: function(nodeList, fn) {
+    return Array.prototype.slice.call(nodeList).forEach(fn, this); // iterate a snapshot: fn may remove nodes, which would shrink a live list and skip entries
+  },
+
+  /**
+   * Iterate over a NodeList, return true if any of the provided iterate
+   * function calls returns true, false otherwise.
+   *
+   * For convenience, the current object context is applied to the
+   * provided iterate function.
+   *
+   * @param  NodeList nodeList The NodeList.
+   * @param  Function fn       The iterate function.
+   * @return Boolean
+   */
+  _someNode: function(nodeList, fn) {
+    return [].some.call(nodeList, fn, this); // borrow Array#some for the array-like list; fn runs with this object as context
+  },
+
+  /**
    * Converts each <a> and <img> uri in the given element to an absolute URI.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
     var prePath = this._uri.prePath;
@@ -144,36 +174,39 @@ Readability.prototype = {
       // Scheme-rooted relative URI.
       if (uri.substr(0, 2) == "//")
         return scheme + "://" + uri.substr(2);
 
       // Prepath-rooted relative URI.
       if (uri[0] == "/")
         return prePath + uri;
 
+      // Dotslash relative URI.
+      if (uri.indexOf("./") === 0)
+        return pathBase + uri.slice(2);
+
       // Standard relative URI; add entire path. pathBase already includes a
       // trailing "/".
       return pathBase + uri;
     }
 
     function convertRelativeURIs(tagName, propName) {
       var elems = articleContent.getElementsByTagName(tagName);
-      for (var i = elems.length; --i >= 0;) {
-        var elem = elems[i];
+      this._forEachNode(elems, function(elem) {
         var relativeURI = elem.getAttribute(propName);
         if (relativeURI != null)
-          elems[i].setAttribute(propName, toAbsoluteURI(relativeURI));
-      }
+          elem.setAttribute(propName, toAbsoluteURI(relativeURI));
+      });
     }
 
      // Fix links.
-    convertRelativeURIs("a", "href");
+    convertRelativeURIs.call(this, "a", "href");
 
      // Fix images.
-    convertRelativeURIs("img", "src");
+    convertRelativeURIs.call(this, "img", "src");
   },
 
   /**
    * Get the article title as an H1.
    *
    * @return void
    **/
   _getArticleTitle: function() {
@@ -219,29 +252,27 @@ Readability.prototype = {
    * This includes things like stripping javascript, CSS, and handling terrible markup.
    *
    * @return void
    **/
   _prepDocument: function() {
     var doc = this._doc;
 
     // Remove all style tags in head
-    var styleTags = doc.getElementsByTagName("style");
-    for (var st = styleTags.length - 1; st >= 0; st -= 1) {
-      styleTags[st].parentNode.removeChild(styleTags[st]);
-    }
+    this._forEachNode([].slice.call(doc.getElementsByTagName("style")), function(styleNode) { // snapshot first: removing from a live list while walking forward skips nodes
+      styleNode.parentNode.removeChild(styleNode);
+    });
 
     if (doc.body) {
       this._replaceBrs(doc.body);
     }
 
-    var fonts = doc.getElementsByTagName("FONT");
-    for (var i = fonts.length; --i >=0;) {
-      this._setNodeTag(fonts[i], "SPAN");
-    }
+    this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
+      this._setNodeTag(fontNode, "SPAN"); // retag <font> as <span>; the callback's `this` is the Readability instance
+    });
   },
 
   /**
    * Finds the next element, starting from the given node, and ignoring
    * whitespace in between. If the given node is an element, the same node is
    * returned.
    */
   _nextElement: function (node) {
@@ -257,19 +288,17 @@ Readability.prototype = {
   /**
    * Replaces 2 or more successive <br> elements with a single <p>.
    * Whitespace between <br> elements are ignored. For example:
    *   <div>foo<br>bar<br> <br><br>abc</div>
    * will become:
    *   <div>foo<br>bar<p>abc</p></div>
    */
   _replaceBrs: function (elem) {
-    var brs = elem.getElementsByTagName("br");
-    for (var i = 0; i < brs.length; i++) {
-      var br = brs[i];
+    this._forEachNode(elem.getElementsByTagName("br"), function(br) {
       var next = br.nextSibling;
 
       // Whether 2 or more <br> elements have been found and replaced with a
       // <p> block.
       var replaced = false;
 
       // If we find a <br> chain, remove the <br>s until we hit another element
       // or non-whitespace. This leaves behind the first <br> in the chain
@@ -298,17 +327,17 @@ Readability.prototype = {
           }
 
           // Otherwise, make this node a child of the new <p>.
           var sibling = next.nextSibling;
           p.appendChild(next);
           next = sibling;
         }
       }
-    }
+    });
   },
 
   _setNodeTag: function (node, tag) {
     // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
     // won't actually be set).
     node.localName = tag.toLowerCase();
     node.tagName = tag.toUpperCase();
   },
@@ -321,16 +350,17 @@ Readability.prototype = {
    * @return void
    **/
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._clean(articleContent, "object");
+    this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
 
     // If there is only one h2, they are probably using it as a header
     // and not a subheader, so remove it since we already have a header.
     if (articleContent.getElementsByTagName('h2').length === 1)
       this._clean(articleContent, "h2");
 
     this._clean(articleContent, "iframe");
@@ -338,36 +368,33 @@ Readability.prototype = {
 
     // Do these last as the previous stuff may have removed junk
     // that will affect these
     this._cleanConditionally(articleContent, "table");
     this._cleanConditionally(articleContent, "ul");
     this._cleanConditionally(articleContent, "div");
 
     // Remove extra paragraphs
-    var articleParagraphs = articleContent.getElementsByTagName('p');
-    for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) {
-      var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
-      var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
-      var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
+    this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
+      var imgCount = paragraph.getElementsByTagName('img').length;
+      var embedCount = paragraph.getElementsByTagName('embed').length;
+      var objectCount = paragraph.getElementsByTagName('object').length;
+      // At this point, nasty iframes have been removed, only remain embedded video ones.
+      var iframeCount = paragraph.getElementsByTagName('iframe').length;
+      var totalCount = imgCount + embedCount + objectCount + iframeCount;
 
-      if (imgCount === 0 &&
-        embedCount === 0 &&
-        objectCount === 0 &&
-        this._getInnerText(articleParagraphs[i], false) === '')
-        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
-    }
+      if (totalCount === 0 && !this._getInnerText(paragraph, false))
+        paragraph.parentNode.removeChild(paragraph);
+    });
 
-    var brs = articleContent.getElementsByTagName("BR");
-    for (var i = brs.length; --i >= 0;) {
-      var br = brs[i];
+    this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
       var next = this._nextElement(br.nextSibling);
       if (next && next.tagName == "P")
         br.parentNode.removeChild(br);
-    }
+    });
   },
 
   /**
    * Initialize a node with the readability object. Also checks the
    * className/id for special names to add to its score.
    *
    * @param Element
    * @return void
@@ -524,49 +551,48 @@ Readability.prototype = {
             var newNode = node.firstElementChild;
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
           } else if (!this._hasChildBlockElement(node)) {
             this._setNodeTag(node, "P");
             elementsToScore.push(node);
           } else {
             // EXPERIMENTAL
-            for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
-              var childNode = node.childNodes[i];
+            this._forEachNode(node.childNodes, function(childNode) {
               if (childNode.nodeType === Node.TEXT_NODE) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
                 p.className = 'readability-styled';
                 node.replaceChild(p, childNode);
               }
-            }
+            });
           }
         }
         node = this._getNextNode(node);
       }
 
       /**
        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
        * Then add their score to their parent node.
        *
        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
       var candidates = [];
-      for (var pt = 0; pt < elementsToScore.length; pt += 1) {
-        var parentNode = elementsToScore[pt].parentNode;
+      this._forEachNode(elementsToScore, function(elementToScore) {
+        var parentNode = elementToScore.parentNode;
         var grandParentNode = parentNode ? parentNode.parentNode : null;
-        var innerText = this._getInnerText(elementsToScore[pt]);
+        var innerText = this._getInnerText(elementToScore);
 
         if (!parentNode || typeof(parentNode.tagName) === 'undefined')
-          continue;
+          return;
 
         // If this paragraph is less than 25 characters, don't even count it.
         if (innerText.length < 25)
-          continue;
+          return;
 
         // Initialize readability data for the parent.
         if (typeof parentNode.readability === 'undefined') {
           this._initializeNode(parentNode);
           candidates.push(parentNode);
         }
 
         // Initialize readability data for the grandparent.
@@ -588,17 +614,17 @@ Readability.prototype = {
         // For every 100 characters in this paragraph, add another point. Up to 3 points.
         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
 
         // Add the score to the parent. The grandparent gets half.
         parentNode.readability.contentScore += contentScore;
 
         if (grandParentNode)
           grandParentNode.readability.contentScore += contentScore / 2;
-      }
+      });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
         var candidate = candidates[c];
 
         // Scale the final candidates score based on link density. Good content
@@ -645,28 +671,29 @@ Readability.prototype = {
         // Because of our bonus system, parents of candidates might have scores
         // themselves. They get half of the node. There won't be nodes with higher
         // scores than our topCandidate, but if we see the score going *up* in the first
         // few steps up the tree, that's a decent sign that there might be more content
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
         var parentOfTopCandidate = topCandidate.parentNode;
+        var lastScore = topCandidate.readability.contentScore;
         // The scores shouldn't get too low.
-        var scoreThreshold = topCandidate.readability.contentScore / 3;
-        var lastScore = parentOfTopCandidate.readability.contentScore;
+        var scoreThreshold = lastScore / 3;
         while (parentOfTopCandidate && parentOfTopCandidate.readability) {
           var parentScore = parentOfTopCandidate.readability.contentScore;
           if (parentScore < scoreThreshold)
             break;
           if (parentScore > lastScore) {
             // Alright! We found a better parent to use.
             topCandidate = parentOfTopCandidate;
             break;
           }
+          lastScore = parentOfTopCandidate.readability.contentScore;
           parentOfTopCandidate = parentOfTopCandidate.parentNode;
         }
       }
 
       // Now that we have the top candidate, look through its siblings for content
       // that might also be related. Things like preambles, content split by ads
       // that we removed, etc.
       var articleContent = doc.createElement("DIV");
@@ -799,40 +826,39 @@ Readability.prototype = {
       byline = byline.trim();
       return (byline.length > 0) && (byline.length < 100);
     }
     return false;
   },
 
   /**
    * Attempts to get excerpt and byline metadata for the article.
-   * 
+   *
    * @return Object with optional "excerpt" and "byline" properties
    */
   _getArticleMetadata: function() {
     var metadata = {};
     var values = {};
     var metaElements = this._doc.getElementsByTagName("meta");
 
     // Match "description", or Twitter's "twitter:description" (Cards)
     // in name attribute.
     var namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi;
 
     // Match Facebook's og:description (Open Graph) in property attribute.
     var propertyPattern = /^\s*og\s*:\s*description\s*$/gi;
 
     // Find description tags.
-    for (var i = 0; i < metaElements.length; i++) {
-      var element = metaElements[i];
+    this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
 
       if (elementName === "author") {
         metadata.byline = element.getAttribute("content");
-        continue;
+        return;
       }
 
       var name = null;
       if (namePattern.test(elementName)) {
         name = elementName;
       } else if (propertyPattern.test(elementProperty)) {
         name = elementProperty;
       }
@@ -841,17 +867,17 @@ Readability.prototype = {
         var content = element.getAttribute("content");
         if (content) {
           // Convert to lowercase and remove any whitespace
           // so we can match below.
           name = name.toLowerCase().replace(/\s/g, '');
           values[name] = content.trim();
         }
       }
-    }
+    });
 
     if ("description" in values) {
       metadata.excerpt = values["description"];
     } else if ("og:description" in values) {
       // Use facebook open graph description.
       metadata.excerpt = values["og:description"];
     } else if ("twitter:description" in values) {
       // Use twitter cards description.
@@ -862,76 +888,68 @@ Readability.prototype = {
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
   _removeScripts: function(doc) {
-    var scripts = doc.getElementsByTagName('script');
-    for (var i = scripts.length - 1; i >= 0; i -= 1) {
-      scripts[i].nodeValue="";
-      scripts[i].removeAttribute('src');
+    this._forEachNode([].slice.call(doc.getElementsByTagName('script')), function(scriptNode) { // snapshot first: removing from a live list while walking forward skips nodes
+      scriptNode.nodeValue = "";
+      scriptNode.removeAttribute('src');
 
-      if (scripts[i].parentNode)
-          scripts[i].parentNode.removeChild(scripts[i]);
-    }
+      if (scriptNode.parentNode) // a script may already be detached
+        scriptNode.parentNode.removeChild(scriptNode);
+    });
   },
 
   /**
    * Check if this node has only whitespace and a single P element
    * Returns false if the DIV node contains non-empty text nodes
    * or if it contains no P or more than 1 element.
    *
    * @param Element
   **/
-  _hasSinglePInsideElement: function(e) {
+  _hasSinglePInsideElement: function(element) {
     // There should be exactly 1 element child which is a P:
-    if (e.children.length != 1 || e.firstElementChild.tagName !== "P") {
+    if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
       return false;
     }
+
     // And there should be no text nodes with real content
-    var childNodes = e.childNodes;
-    for (var i = childNodes.length; --i >= 0;) {
-      var node = childNodes[i];
-      if (node.nodeType == Node.TEXT_NODE &&
-          this.REGEXPS.hasContent.test(node.textContent)) {
-        return false;
-      }
-    }
-
-    return true;
+    return !this._someNode(element.childNodes, function(node) { // negated: true only when no child node matches
+      return node.nodeType === Node.TEXT_NODE &&
+             this.REGEXPS.hasContent.test(node.textContent); // `this` is the Readability instance, passed through by _someNode
+    });
   },
 
   /**
    * Determine whether element has any children block level elements.
    *
    * @param Element
    */
-  _hasChildBlockElement: function (e) {
-    var length = e.children.length;
-    for (var i = 0; i < length; i++) {
-      var child = e.children[i];
-      if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
-        return true;
-    }
-    return false;
+  _hasChildBlockElement: function (element) {
+    return this._someNode(element.childNodes, function(node) { // walks childNodes (every node type), where the removed loop walked children
+      return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || // this child's tag is listed in DIV_TO_P_ELEMS
+             this._hasChildBlockElement(node); // otherwise recurse into this child's own child nodes
+    });
   },
 
   /**
    * Get the inner text of a node - cross browser compatibly.
    * This also strips out any excess whitespace to be found.
    *
    * @param Element
+   * @param Boolean normalizeSpaces (default: true)
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
+    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; // an omitted argument means "normalize"
     var textContent = e.textContent.trim();
-    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
 
     if (normalizeSpaces) {
       return textContent.replace(this.REGEXPS.normalize, " ");
     } else {
       return textContent;
     }
   },
 
@@ -980,24 +998,27 @@ Readability.prototype = {
 
   /**
    * Get the density of links as a percentage of the content
    * This is the amount of text that is inside a link divided by the total text in the node.
    *
    * @param Element
    * @return number (float)
   **/
-  _getLinkDensity: function(e) {
-    var links = e.getElementsByTagName("a");
-    var textLength = this._getInnerText(e).length;
+  _getLinkDensity: function(element) {
+    var textLength = this._getInnerText(element).length;
+    if (textLength === 0)
+      return 0; // no text means no link text; a bare `return` would hand callers undefined instead of the documented number
+
     var linkLength = 0;
 
-    for (var i = 0, il = links.length; i < il; i += 1) {
-      linkLength += this._getInnerText(links[i]).length;
-    }
+    // XXX implement _reduceNodeList?
+    this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
+      linkLength += this._getInnerText(linkNode).length; // accumulate the text length found inside each <a>
+    });
 
     return linkLength / textLength;
   },
 
   /**
    * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
    *
    * @author Dan Lacy
@@ -1400,38 +1421,36 @@ Readability.prototype = {
    * Clean a node of all elements of type "tag".
    * (Unless it's a youtube/vimeo video. People love movies.)
    *
    * @param Element
    * @param string tag to clean
    * @return void
    **/
   _clean: function(e, tag) {
-    var targetList = e.getElementsByTagName(tag);
-    var isEmbed = (tag === 'object' || tag === 'embed');
+    var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
 
-    for (var y = targetList.length - 1; y >= 0; y -= 1) {
+    this._forEachNode([].slice.call(e.getElementsByTagName(tag)), function(element) { // snapshot first: removing from a live list while walking forward skips nodes
       // Allow youtube and vimeo videos through as people usually want to see those.
       if (isEmbed) {
-        var attributeValues = "";
-        for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
-          attributeValues += targetList[y].attributes[i].value + '|';
-        }
+        var attributeValues = [].map.call(element.attributes, function(attr) {
+          return attr.value;
+        }).join("|");
 
         // First, check the elements attributes to see if any of them contain youtube or vimeo
         if (this.REGEXPS.videos.test(attributeValues))
-          continue;
+          return;
 
         // Then check the elements inside this element for the same.
-        if (this.REGEXPS.videos.test(targetList[y].innerHTML))
-          continue;
+        if (this.REGEXPS.videos.test(element.innerHTML))
+          return;
       }
 
-      targetList[y].parentNode.removeChild(targetList[y]);
-    }
+      element.parentNode.removeChild(element);
+    });
   },
 
   /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
    * @return void
    **/
@@ -1573,17 +1592,17 @@ Readability.prototype = {
     // }
 
     // If we haven't found an excerpt in the article's metadata, use the article's
     // first paragraph as the excerpt. This is used for displaying a preview of
     // the article's content.
     if (!metadata.excerpt) {
       var paragraphs = articleContent.getElementsByTagName("p");
       if (paragraphs.length > 0) {
-        metadata.excerpt = paragraphs[0].textContent;
+        metadata.excerpt = paragraphs[0].textContent.trim();
       }
     }
 
     return { uri: this._uri,
              title: articleTitle,
              byline: metadata.byline || this._articleByline,
              dir: this._articleDir,
              content: articleContent.innerHTML,