Bug 1152022 - Update Readability to github tip. r=gijs, r=margaret, a=sledru
author: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Thu, 02 Apr 2015 21:48:31 +0100
changeset: 258450 333017ad43a9
parent: 258449 2659ba26dcf2
child: 258451 7717f3aa4cf6
push id: 4669
push user: ryanvm@gmail.com
push date: 2015-04-13 16:55 +0000
treeherder: mozilla-beta@333017ad43a9 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: gijs, margaret, sledru
bugs: 1152022
milestone: 38.0
Bug 1152022 - Update Readability to github tip. r=gijs, r=margaret, a=sledru
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -32,16 +32,49 @@
  *      document, you must take care to manually update them yourself.
  */
 (function (global) {
 
   function error(m) {
     dump("JSDOMParser error: " + m + "\n");
   }
 
+  // XML only defines these and the numeric ones:
+
+  var entityTable = {
+    "lt": "<",
+    "gt": ">",
+    "amp": "&",
+    "quot": '"',
+    "apos": "'",
+  };
+
+  var reverseEntityTable = {
+    "<": "&lt;",
+    ">": "&gt;",
+    "&": "&amp;",
+    '"': "&quot;",
+    "'": "&apos;",
+  };
+
+  function encodeHTML(s) {
+    return s.replace(/[&<>'"]/g, function(x) {
+      return reverseEntityTable[x];
+    });
+  }
+
+  function decodeHTML(str) {
+    return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) {
+      return entityTable[tag];
+    }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(match, hex, numStr) {
+      var num = parseInt(hex || numStr, hex ? 16 : 10); // read num
+      return String.fromCharCode(num);
+    });
+  }
+
   // When a style is set in JS, map it to the corresponding CSS attribute
   var styleMap = {
     "alignmentBaseline": "alignment-baseline",
     "background": "background",
     "backgroundAttachment": "background-attachment",
     "backgroundClip": "background-clip",
     "backgroundColor": "background-color",
     "backgroundImage": "background-image",
@@ -442,26 +475,48 @@
         oldNode.previousSibling = null;
         oldNode.nextSibling = null;
         if (oldNode.nodeType === Node.ELEMENT_NODE) {
           oldNode.previousElementSibling = null;
           oldNode.nextElementSibling = null;
         }
         return oldNode;
       }
-    }
+    },
+
+    __JSDOMParser__: true,
   };
 
   for (var i in nodeTypes) {
     Node[i] = Node.prototype[i] = nodeTypes[i];
   }
 
   var Attribute = function (name, value) {
     this.name = name;
-    this.value = value;
+    this._value = value;
+  };
+
+  Attribute.prototype = {
+    get value() {
+      return this._value;
+    },
+    setValue: function(newValue) {
+      this._value = newValue;
+      delete this._decodedValue;
+    },
+    setDecodedValue: function(newValue) {
+      this._value = encodeHTML(newValue);
+      this._decodedValue = newValue;
+    },
+    getDecodedValue: function() {
+      if (typeof this._decodedValue === "undefined") {
+        this._decodedValue = (this._value && decodeHTML(this._value)) || "";
+      }
+      return this._decodedValue;
+    },
   };
 
   var Comment = function () {
     this.childNodes = [];
   };
 
   Comment.prototype = {
     __proto__: Node.prototype,
@@ -474,17 +529,37 @@
     this.childNodes = [];
   };
 
   Text.prototype = {
     __proto__: Node.prototype,
 
     nodeName: "#text",
     nodeType: Node.TEXT_NODE,
-    textContent: ""
+    get textContent() {
+      if (typeof this._textContent === "undefined") {
+        this._textContent = decodeHTML(this._innerHTML || "");
+      }
+      return this._textContent;
+    },
+    get innerHTML() {
+      if (typeof this._innerHTML === "undefined") {
+        this._innerHTML = encodeHTML(this._textContent || "");
+      }
+      return this._innerHTML;
+    },
+
+    set innerHTML(newHTML) {
+      this._innerHTML = newHTML;
+      delete this._textContent;
+    },
+    set textContent(newText) {
+      this._textContent = newText;
+      delete this._innerHTML;
+    },
   }
 
   var Document = function () {
     this.styleSheets = [];
     this.childNodes = [];
     this.children = [];
   };
 
@@ -577,31 +652,34 @@
         for (i = 0; i < node.childNodes.length; i++) {
           var child = node.childNodes[i];
           if (child.localName) {
             arr.push("<" + child.localName);
 
             // serialize attribute list
             for (var j = 0; j < child.attributes.length; j++) {
               var attr = child.attributes[j];
-              var quote = (attr.value.indexOf('"') === -1 ? '"' : "'");
-              arr.push(" " + attr.name + '=' + quote + attr.value + quote);
+              // the attribute value will be HTML escaped.
+              var val = attr.value;
+              var quote = (val.indexOf('"') === -1 ? '"' : "'");
+              arr.push(" " + attr.name + '=' + quote + val + quote);
             }
 
             if (child.localName in voidElems) {
               // if this is a self-closing element, end it here
-              arr.push("/>");
+              arr.push(">");
             } else {
               // otherwise, add its children
               arr.push(">");
               getHTML(child);
               arr.push("</" + child.localName + ">");
             }
           } else {
-            arr.push(child.textContent);
+            // This is a text node, so asking for innerHTML won't recurse.
+            arr.push(child.innerHTML);
           }
         }
       }
 
       // Using Array.join() avoids the overhead from lazy string concatenation.
       // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
       var arr = [];
       getHTML(this);
@@ -610,29 +688,31 @@
 
     set innerHTML(html) {
       var parser = new JSDOMParser();
       var node = parser.parse(html);
       for (var i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
       }
       this.childNodes = node.childNodes;
+      this.children = node.children;
       for (var i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = this;
       }
     },
 
     set textContent(text) {
       // clear parentNodes for existing children
       for (var i = this.childNodes.length; --i >= 0;) {
         this.childNodes[i].parentNode = null;
       }
 
       var node = new Text();
       this.childNodes = [ node ];
+      this.children = [];
       node.textContent = text;
       node.parentNode = this;
     },
 
     get textContent() {
       function getText(node) {
         var nodes = node.childNodes;
         for (var i = 0; i < nodes.length; i++) {
@@ -651,30 +731,30 @@
       getText(this);
       return text.join("");
     },
 
     getAttribute: function (name) {
       for (var i = this.attributes.length; --i >= 0;) {
         var attr = this.attributes[i];
         if (attr.name === name)
-          return attr.value;
+          return attr.getDecodedValue();
       }
       return undefined;
     },
 
     setAttribute: function (name, value) {
       for (var i = this.attributes.length; --i >= 0;) {
         var attr = this.attributes[i];
         if (attr.name === name) {
-          attr.value = value;
+          attr.setDecodedValue(value);
           return;
         }
       }
-      this.attributes.push(new Attribute(name, value));
+      this.attributes.push(new Attribute(name, encodeHTML(value)));
     },
 
     removeAttribute: function (name) {
       for (var i = this.attributes.length; --i >= 0;) {
         var attr = this.attributes[i];
         if (attr.name === name) {
           this.attributes.splice(i, 1);
           break;
@@ -815,19 +895,16 @@
       if (c !== '"' && c !== "'") {
         error("Error reading attribute " + name + ", expecting '\"'");
         return;
       }
 
       // Read the attribute value (and consume the matching quote)
       var value = this.readString(c);
 
-      if (!value)
-        return;
-
       node.attributes.push(new Attribute(name, value));
 
       return;
     },
 
     /**
      * Parses and returns an Element node. This is called after a '<' has been
      * read.
@@ -887,17 +964,17 @@
     /**
      * If the current input matches this string, advance the input index;
      * otherwise, do nothing.
      *
      * @returns whether input matched string
      */
     match: function (str) {
       var strlen = str.length;
-      if (this.html.substr(this.currentChar, strlen) === str) {
+      if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
         this.currentChar += strlen;
         return true;
       }
       return false;
     },
 
     /**
      * Searches the input until a string is found and discards all input up to
@@ -919,75 +996,109 @@
         // Don't keep Comment nodes
         if (child.nodeType !== 8) {
           node.appendChild(child);
         }
       }
     },
 
     readScript: function (node) {
-      var index = this.html.indexOf("</script>", this.currentChar);
-      if (index === -1) {
-        index = this.html.length;
+      while (this.currentChar < this.html.length) {
+        var c = this.nextChar();
+        var nextC = this.peekNext();
+        if (c === "<") {
+          if (nextC === "!" || nextC === "?") {
+            // We're still before the ! or ? that is starting this comment:
+            this.currentChar++;
+            node.appendChild(this.discardNextComment());
+            continue;
+          }
+          if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
+            // Go back before the '<' so we find the end tag.
+            this.currentChar--;
+            // Done with this script tag, the caller will close:
+            return;
+          }
+        }
+        // Either c wasn't a '<' or it was but we couldn't find either a comment
+        // or a closing script tag, so we should just parse as text until the next one
+        // comes along:
+
+        var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
+        var textNode = haveTextNode ? node.lastChild : new Text();
+        var n = this.html.indexOf("<", this.currentChar);
+        // Decrement this to include the current character *afterwards* so we don't get stuck
+        // looking for the same < all the time.
+        this.currentChar--;
+        if (n === -1) {
+          textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
+          this.currentChar = this.html.length;
+        } else {
+          textNode.innerHTML += this.html.substring(this.currentChar, n);
+          this.currentChar = n;
+        }
+        if (!haveTextNode)
+          node.appendChild(textNode);
       }
-      var txt = new Text();
-      txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
-      node.appendChild(txt);
-      this.currentChar = index;
+    },
+
+    discardNextComment: function() {
+      if (this.match("--")) {
+        this.discardTo("-->");
+      } else {
+        var c = this.nextChar();
+        while (c !== ">") {
+          if (c === undefined)
+            return null;
+          if (c === '"' || c === "'")
+            this.readString(c);
+          c = this.nextChar();
+        }
+      }
+      return new Comment();
     },
 
 
     /**
      * Reads the next child node from the input. If we're reading a closing
      * tag, or if we've reached the end of input, return null.
      *
      * @returns the node
      */
     readNode: function () {
       var c = this.nextChar();
- 
+
       if (c === undefined)
         return null;
 
       // Read any text as Text node
       if (c !== "<") {
         --this.currentChar;
         var node = new Text();
         var n = this.html.indexOf("<", this.currentChar);
         if (n === -1) {
-          node.textContent = this.html.substring(this.currentChar, this.html.length);
+          node.innerHTML = this.html.substring(this.currentChar, this.html.length);
           this.currentChar = this.html.length;
         } else {
-          node.textContent = this.html.substring(this.currentChar, n);
+          node.innerHTML = this.html.substring(this.currentChar, n);
           this.currentChar = n;
         }
         return node;
       }
 
       c = this.peekNext();
 
       // Read Comment node. Normally, Comment nodes know their inner
       // textContent, but we don't really care about Comment nodes (we throw
       // them away in readChildren()). So just returning an empty Comment node
       // here is sufficient.
       if (c === "!" || c === "?") {
+        // We're still before the ! or ? that is starting this comment:
         this.currentChar++;
-        if (this.match("--")) {
-          this.discardTo("-->");
-        } else {
-          var c = this.nextChar();
-          while (c !== ">") {
-            if (c === undefined)
-              return null;
-            if (c === '"' || c === "'")
-              this.readString(c);
-            c = this.nextChar();
-          }
-        }
-        return new Comment();
+        return this.discardNextComment();
       }
 
       // If we're reading a closing tag, return null. This means we've reached
       // the end of this set of child nodes.
       if (c === "/") {
         --this.currentChar;
         return null;
       }
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -21,25 +21,38 @@
  * limitations under the License.
  */
 
 /*
  * This code is heavily based on Arc90's readability.js (1.7.1) script
  * available at: http://code.google.com/p/arc90labs-readability
  */
 var root = this;
-var Readability = function(uri, doc) {
-  var ENABLE_LOGGING = false;
+
+/**
+ * Public constructor.
+ * @param {Object}       uri     The URI descriptor object.
+ * @param {HTMLDocument} doc     The document to parse.
+ * @param {Object}       options The options object.
+ */
+var Readability = function(uri, doc, options) {
+  options = options || {};
 
   this._uri = uri;
   this._doc = doc;
   this._biggestFrame = false;
   this._articleByline = null;
   this._articleDir = null;
 
+  // Configurable options
+  this._debug = !!options.debug;
+  this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
+  this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
+  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+
   // Start with all flags set
   this._flags = this.FLAG_STRIP_UNLIKELYS |
                 this.FLAG_WEIGHT_CLASSES |
                 this.FLAG_CLEAN_CONDITIONALLY;
 
   // The list of pages we've parsed in this call of readability,
   // for autopaging. As a key store for easier searching.
   this._parsedPages = {};
@@ -47,17 +60,17 @@ var Readability = function(uri, doc) {
   // A list of the ETag headers of pages we've parsed, in case they happen to match,
   // we'll know it's a duplicate.
   this._pageETags = {};
 
   // Make an AJAX request for each page and append it to the document.
   this._curPageNum = 1;
 
   // Control whether log messages are sent to the console
-  if (ENABLE_LOGGING) {
+  if (this._debug) {
     function logEl(e) {
       var rv = e.nodeName + " ";
       if (e.nodeType == e.TEXT_NODE) {
         return rv + '("' + e.textContent + '")';
       }
       var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
       var elDesc = e.id ? "(#" + e.id + classDesc + ")" :
                           (classDesc ? "(" + classDesc + ")" : "");
@@ -79,31 +92,34 @@ var Readability = function(uri, doc) {
   }
 }
 
 Readability.prototype = {
   FLAG_STRIP_UNLIKELYS: 0x1,
   FLAG_WEIGHT_CLASSES: 0x2,
   FLAG_CLEAN_CONDITIONALLY: 0x4,
 
+  // Max number of nodes supported by this parser. Default: 0 (no limit)
+  DEFAULT_MAX_ELEMS_TO_PARSE: 0,
+
   // The number of top candidates to consider when analysing how
   // tight the competition is among candidates.
-  N_TOP_CANDIDATES: 5,
+  DEFAULT_N_TOP_CANDIDATES: 5,
 
   // The maximum number of pages to loop through before we call
   // it quits and just show a link.
-  MAX_PAGES: 5,
+  DEFAULT_MAX_PAGES: 5,
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
+    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
-    negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
+    negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
@@ -331,20 +347,35 @@ Readability.prototype = {
           p.appendChild(next);
           next = sibling;
         }
       }
     });
   },
 
   _setNodeTag: function (node, tag) {
-    // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
-    // won't actually be set).
-    node.localName = tag.toLowerCase();
-    node.tagName = tag.toUpperCase();
+    this.log("_setNodeTag", node, tag);
+    if (node.__JSDOMParser__) {
+      node.localName = tag.toLowerCase();
+      node.tagName = tag.toUpperCase();
+      return node;
+    }
+
+    var replacement = node.ownerDocument.createElement(tag);
+    while (node.firstChild) {
+      replacement.appendChild(node.firstChild);
+    }
+    node.parentNode.replaceChild(replacement, node);
+    if (node.readability)
+      replacement.readability = node.readability;
+
+    for (var i = 0; i < node.attributes.length; i++) {
+      replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+    }
+    return replacement;
   },
 
   /**
    * Prepare the article node for display. Clean out any inline styles,
    * iframes, forms, strip extraneous <p> tags, etc.
    *
    * @param Element
    * @return void
@@ -464,16 +495,47 @@ Readability.prototype = {
     // (because this is depth-first traversal, we will have already
     // seen the parent nodes themselves).
     do {
       node = node.parentNode;
     } while (node && !node.nextElementSibling);
     return node && node.nextElementSibling;
   },
 
+  /**
+   * Like _getNextNode, but for DOM implementations with no
+   * firstElementChild/nextElementSibling functionality...
+   */
+  _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
+    function nextSiblingEl(n) {
+      do {
+        n = n.nextSibling;
+      } while (n && n.nodeType !== n.ELEMENT_NODE);
+      return n;
+    }
+    // First check for kids if those aren't being ignored
+    if (!ignoreSelfAndKids && node.children[0]) {
+      return node.children[0];
+    }
+    // Then for siblings...
+    var next = nextSiblingEl(node);
+    if (next) {
+      return next;
+    }
+    // And finally, move up the parent chain *and* find a sibling
+    // (because this is depth-first traversal, we will have already
+    // seen the parent nodes themselves).
+    do {
+      node = node.parentNode;
+      if (node)
+        next = nextSiblingEl(node);
+    } while (node && !next);
+    return node && next;
+  },
+
   _checkByline: function(node, matchString) {
     if (this._articleByline) {
       return false;
     }
 
     if (node.getAttribute !== undefined) {
       var rel = node.getAttribute("rel");
     }
@@ -489,16 +551,17 @@ Readability.prototype = {
   /***
    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
    *
    * @param page a document to run upon. Needs to be a full document, complete with body.
    * @return Element
   **/
   _grabArticle: function (page) {
+    this.log("**** grabArticle ****");
     var doc = this._doc;
     var isPaging = (page !== null ? true: false);
     page = page ? page : this._doc.body;
 
     // We can't grab an article if we don't have a page!
     if (!page) {
       this.log("No body found in document. Abort.");
       return null;
@@ -543,21 +606,21 @@ Readability.prototype = {
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
           // Sites like http://mobile.slate.com encloses each paragraph with a DIV
           // element. DIVs with only a P element inside and no text content can be
           // safely converted into plain P elements to avoid confusing the scoring
           // algorithm with DIVs with are, in practice, paragraphs.
           if (this._hasSinglePInsideElement(node)) {
-            var newNode = node.firstElementChild;
+            var newNode = node.children[0];
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
           } else if (!this._hasChildBlockElement(node)) {
-            this._setNodeTag(node, "P");
+            node = this._setNodeTag(node, "P");
             elementsToScore.push(node);
           } else {
             // EXPERIMENTAL
             this._forEachNode(node.childNodes, function(childNode) {
               if (childNode.nodeType === Node.TEXT_NODE) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
@@ -630,22 +693,22 @@ Readability.prototype = {
         // Scale the final candidates score based on link density. Good content
         // should have a relatively small link density (5% or less) and be mostly
         // unaffected by this operation.
         var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
         candidate.readability.contentScore = candidateScore;
 
         this.log('Candidate:', candidate, "with score " + candidateScore);
 
-        for (var t = 0; t < this.N_TOP_CANDIDATES; t++) {
+        for (var t = 0; t < this._nbTopCandidates; t++) {
           var aTopCandidate = topCandidates[t];
 
           if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
             topCandidates.splice(t, 0, candidate);
-            if (topCandidates.length > this.N_TOP_CANDIDATES)
+            if (topCandidates.length > this._nbTopCandidates)
               topCandidates.pop();
             break;
           }
         }
       }
 
       var topCandidate = topCandidates[0] || null;
       var neededToCreateTopCandidate = false;
@@ -738,38 +801,38 @@ Readability.prototype = {
         if (append) {
           this.log("Appending node:", sibling);
 
           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
             // We have a node that isn't a common block level element, like a form or td tag.
             // Turn it into a div so it doesn't get filtered out later by accident.
             this.log("Altering sibling:", sibling, 'to div.');
 
-            this._setNodeTag(sibling, "DIV");
+            sibling = this._setNodeTag(sibling, "DIV");
           }
 
           // To ensure a node does not interfere with readability styles,
           // remove its classnames.
           sibling.removeAttribute("class");
 
           articleContent.appendChild(sibling);
           // siblings is a reference to the children array, and
           // sibling is removed from the array when we call appendChild().
           // As a result, we must revisit this index since the nodes
           // have been shifted.
           s -= 1;
           sl -= 1;
         }
       }
 
-      if (this.ENABLE_LOGGING)
+      if (this._debug)
         this.log("Article content pre-prep: " + articleContent.innerHTML);
       // So we have all of the content that we need. Now we clean it up for presentation.
       this._prepArticle(articleContent);
-      if (this.ENABLE_LOGGING)
+      if (this._debug)
         this.log("Article content post-prep: " + articleContent.innerHTML);
 
       if (this._curPageNum === 1) {
         if (neededToCreateTopCandidate) {
           // We already created a fake div thing, and there wouldn't have been any siblings left
           // for the previous loop, so there's no point trying to create a new div, and then
           // move all the children over. Just assign IDs and class names here. No need to append
           // because that already happened anyway.
@@ -782,17 +845,17 @@ Readability.prototype = {
           var children = articleContent.childNodes;
           while (children.length) {
             div.appendChild(children[0]);
           }
           articleContent.appendChild(div);
         }
       }
 
-      if (this.ENABLE_LOGGING)
+      if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);
 
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
       if (this._getInnerText(articleContent, true).length < 500) {
@@ -895,28 +958,32 @@ Readability.prototype = {
   _removeScripts: function(doc) {
     this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
       scriptNode.nodeValue = "";
       scriptNode.removeAttribute('src');
 
       if (scriptNode.parentNode)
         scriptNode.parentNode.removeChild(scriptNode);
     });
+    this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
+      if (noscriptNode.parentNode)
+        noscriptNode.parentNode.removeChild(noscriptNode);
+    });
   },
 
   /**
    * Check if this node has only whitespace and a single P element
    * Returns false if the DIV node contains non-empty text nodes
    * or if it contains no P or more than 1 element.
    *
    * @param Element
   **/
   _hasSinglePInsideElement: function(element) {
     // There should be exactly 1 element child which is a P:
-    if (element.children.length != 1 || element.firstElementChild.tagName !== "P") {
+    if (element.children.length != 1 || element.children[0].tagName !== "P") {
       return false;
     }
 
     // And there should be no text nodes with real content
     return !this._someNode(element.childNodes, function(node) {
       return node.nodeType === Node.TEXT_NODE &&
              this.REGEXPS.hasContent.test(node.textContent);
     });
@@ -1285,17 +1352,17 @@ Readability.prototype = {
 
     var articlePage = doc.createElement("DIV");
     articlePage.id = 'readability-page-' + this._curPageNum;
     articlePage.className = 'page';
     articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
 
     doc.getElementById("readability-content").appendChild(articlePage);
 
-    if (this._curPageNum > this.MAX_PAGES) {
+    if (this._curPageNum > this._maxPages) {
       var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
       articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
       return;
     }
 
     // Now that we've built the article page DOM element, get the page content
     // asynchronously and load the cleaned content into the div we created for it.
     (function(pageUrl, thisPage) {
@@ -1444,16 +1511,39 @@ Readability.prototype = {
           return;
       }
 
       element.parentNode.removeChild(element);
     });
   },
 
   /**
+   * Check whether one of the given node's ancestors has a tag name
+   * matching the provided one.
+   * @param  HTMLElement node
+   * @param  String      tagName
+   * @param  Number      maxDepth
+   * @return Boolean
+   */
+  _hasAncestorTag: function(node, tagName, maxDepth) {
+    maxDepth = maxDepth || 3;
+    tagName = tagName.toUpperCase();
+    var depth = 0;
+    while (node.parentNode) {
+      if (depth > maxDepth)
+        return false;
+      if (node.parentNode.tagName === tagName)
+        return true;
+      node = node.parentNode;
+      depth++;
+    }
+    return false;
+  },
+
+  /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
    * @return void
    **/
   _cleanConditionally: function(e, tag) {
     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
       return;
@@ -1488,35 +1578,35 @@ Readability.prototype = {
         for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
           if (!this.REGEXPS.videos.test(embeds[ei].src))
             embedCount += 1;
         }
 
         var linkDensity = this._getLinkDensity(tagsList[i]);
         var contentLength = this._getInnerText(tagsList[i]).length;
         var toRemove = false;
-
-        if (img > p) {
+        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
           toRemove = true;
         } else if (li > p && tag !== "ul" && tag !== "ol") {
           toRemove = true;
         } else if ( input > Math.floor(p/3) ) {
           toRemove = true;
         } else if (contentLength < 25 && (img === 0 || img > 2) ) {
           toRemove = true;
         } else if (weight < 25 && linkDensity > 0.2) {
           toRemove = true;
         } else if (weight >= 25 && linkDensity > 0.5) {
           toRemove = true;
         } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
           toRemove = true;
         }
 
-        if (toRemove)
+        if (toRemove) {
           tagsList[i].parentNode.removeChild(tagsList[i]);
+        }
       }
     }
   },
 
   /**
    * Clean out spurious headers from an Element. Checks things like classnames and link density.
    *
    * @param Element
@@ -1540,28 +1630,72 @@ Readability.prototype = {
     this._flags = this._flags | flag;
   },
 
   _removeFlag: function(flag) {
     this._flags = this._flags & ~flag;
   },
 
   /**
+   * Decides whether or not the document is reader-able without parsing the whole thing.
+   *
+   * @return boolean Whether or not we suspect parse() will succeed at returning an article object.
+   */
+  isProbablyReaderable: function() {
+    var nodes = this._doc.getElementsByTagName("p");
+    if (nodes.length < 5) {
+      return false;
+    }
+
+    var possibleParagraphs = 0;
+    for (var i = 0; i < nodes.length; i++) {
+      var node = nodes[i];
+      var matchString = node.className + " " + node.id;
+
+      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
+          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
+        continue;
+      }
+
+      if (node.textContent.trim().length < 100) {
+        continue;
+      }
+
+      possibleParagraphs++;
+      if (possibleParagraphs >= 5) {
+        return true;
+      }
+    }
+    return false;
+  },
+
+  /**
    * Runs readability.
    *
    * Workflow:
    *  1. Prep the document by removing script tags, css, etc.
    *  2. Build readability's DOM tree.
    *  3. Grab the article content from the current dom tree.
    *  4. Replace the current DOM tree with the new one.
    *  5. Read peacefully.
    *
    * @return void
    **/
   parse: function () {
+    // Avoid parsing too large documents, as per configuration option
+    if (this._maxElemsToParse > 0) {
+      var numTags = this._doc.getElementsByTagName("*").length;
+      if (numTags > this._maxElemsToParse) {
+        throw new Error("Aborting parsing document; " + numTags + " elements found");
+      }
+    }
+
+    if (typeof this._doc.documentElement.firstElementChild === "undefined") {
+      this._getNextNode = this._getNextNodeNoElementProperties;
+    }
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
     // FIXME: Disabled multi-page article support for now as it
     // needs more work on infrastructure.
 
     // Make sure this document is added to the list of parsed pages first,
     // so we don't double up on the first page.