Bug 1416561 - update readability from git ( c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5 ), r=already-reviewed
authorGijs Kruitbosch <gijskruitbosch@gmail.com>
Sun, 12 Nov 2017 12:47:32 +0000
changeset 444715 7fb001f811d183095e9cfdd958c9215484377396
parent 444714 046d8bdbc0238ea0ea611c5ffd92770020a4abf6
child 444721 1f91961bb79ad06fd4caef9e5dfd546afd5bf42c
child 445740 559f4487dc65fa76687447282d40b0eee60f20bf
push id1618
push userCallek@gmail.com
push dateThu, 11 Jan 2018 17:45:48 +0000
treeherdermozilla-release@882ca853e05a [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersalready-reviewed
bugs1416561
milestone58.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1416561 - update readability from git ( c3ff1a2d2c94c1db257b2c9aa88a4b8fbeb221c5 ), r=already-reviewed MozReview-Commit-ID: 3WX99CsZ4z5
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -33,43 +33,31 @@
  * @param {HTMLDocument} doc     The document to parse.
  * @param {Object}       options The options object.
  */
 function Readability(uri, doc, options) {
   options = options || {};
 
   this._uri = uri;
   this._doc = doc;
-  this._biggestFrame = false;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
 
   // Configureable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
-  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+  this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
 
   // Start with all flags set
   this._flags = this.FLAG_STRIP_UNLIKELYS |
                 this.FLAG_WEIGHT_CLASSES |
                 this.FLAG_CLEAN_CONDITIONALLY;
 
-  // The list of pages we've parsed in this call of readability,
-  // for autopaging. As a key store for easier searching.
-  this._parsedPages = {};
-
-  // A list of the ETag headers of pages we've parsed, in case they happen to match,
-  // we'll know it's a duplicate.
-  this._pageETags = {};
-
-  // Make an AJAX request for each page and append it to the document.
-  this._curPageNum = 1;
-
   var logEl;
 
   // Control whether log messages are sent to the console
   if (this._debug) {
     logEl = function(e) {
       var rv = e.nodeName + " ";
       if (e.nodeType == e.TEXT_NODE) {
         return rv + '("' + e.textContent + '")';
@@ -105,23 +93,22 @@ Readability.prototype = {
 
   // Max number of nodes supported by this parser. Default: 0 (no limit)
   DEFAULT_MAX_ELEMS_TO_PARSE: 0,
 
   // The number of top candidates to consider when analysing how
   // tight the competition is among candidates.
   DEFAULT_N_TOP_CANDIDATES: 5,
 
-  // The maximum number of pages to loop through before we call
-  // it quits and just show a link.
-  DEFAULT_MAX_PAGES: 5,
-
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
+  // The default number of words an article must have in order to return a result
+  DEFAULT_WORD_THRESHOLD: 500,
+
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
     unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
@@ -134,16 +121,20 @@ Readability.prototype = {
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
 
   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
 
+  PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+
+  DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+
   /**
    * Run any post-process modifications to article content as necessary.
    *
    * @param Element
    * @return void
   **/
   _postProcessContent: function(articleContent) {
     // Readability cannot open relative uris so we convert them to absolute uris.
@@ -316,51 +307,67 @@ Readability.prototype = {
     try {
       curTitle = origTitle = doc.title;
 
       // If they had an element with id "title" in their HTML
       if (typeof curTitle !== "string")
         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
     } catch (e) {/* ignore exceptions setting the title. */}
 
-    if (curTitle.match(/ [\|\-] /)) {
-      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
+    var titleHadHierarchicalSeparators = false;
+    function wordCount(str) {
+      return str.split(/\s+/).length;
+    }
 
-      if (curTitle.split(' ').length < 3)
-        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
+    // If there's a separator in the title, first remove the final part
+    if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
+      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, '$1');
+
+      // If the resulting title is too short (3 words or fewer), remove
+      // the first part instead:
+      if (wordCount(curTitle) < 3)
+        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
     } else if (curTitle.indexOf(': ') !== -1) {
       // Check if we have an heading containing this exact string, so we
       // could assume it's the full title.
       var headings = this._concatNodeLists(
         doc.getElementsByTagName('h1'),
         doc.getElementsByTagName('h2')
       );
       var match = this._someNode(headings, function(heading) {
         return heading.textContent === curTitle;
       });
 
       // If we don't, let's extract the title out of the original title string.
       if (!match) {
         curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
 
         // If the title is now too short, try the first colon instead:
-        if (curTitle.split(' ').length < 3)
+        if (wordCount(curTitle) < 3)
           curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
       }
     } else if (curTitle.length > 150 || curTitle.length < 15) {
       var hOnes = doc.getElementsByTagName('h1');
 
       if (hOnes.length === 1)
         curTitle = this._getInnerText(hOnes[0]);
     }
 
     curTitle = curTitle.trim();
-
-    if (curTitle.split(' ').length <= 4)
+    // If we now have 4 words or fewer as our title, and either no
+    // 'hierarchical' separators (\, /, > or ») were found in the original
+    // title or we decreased the number of words by more than 1 word, use
+    // the original title.
+    var curTitleWordCount = wordCount(curTitle);
+    if (curTitleWordCount <= 4 &&
+        (!titleHadHierarchicalSeparators ||
+         curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
       curTitle = origTitle;
+    }
 
     return curTitle;
   },
 
   /**
    * Prepare the HTML document for readability to scrape it.
    * This includes things like stripping javascript, CSS, and handling terrible markup.
    *
@@ -496,20 +503,26 @@ Readability.prototype = {
     });
 
     // If there is only one h2 and its text content substantially equals article title,
     // they are probably using it as a header and not a subheader,
     // so remove it since we already extract the title separately.
     var h2 = articleContent.getElementsByTagName('h2');
     if (h2.length === 1) {
       var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
-      if (Math.abs(lengthSimilarRate) < 0.5 &&
-          (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
-                                   this._articleTitle.includes(h2[0].textContent))) {
-        this._clean(articleContent, "h2");
+      if (Math.abs(lengthSimilarRate) < 0.5) {
+        var titlesMatch = false;
+        if (lengthSimilarRate > 0) {
+          titlesMatch = h2[0].textContent.includes(this._articleTitle);
+        } else {
+          titlesMatch = this._articleTitle.includes(h2[0].textContent);
+        }
+        if (titlesMatch) {
+          this._clean(articleContent, "h2");
+        }
       }
     }
 
     this._clean(articleContent, "iframe");
     this._clean(articleContent, "input");
     this._clean(articleContent, "textarea");
     this._clean(articleContent, "select");
     this._clean(articleContent, "button");
@@ -1009,45 +1022,43 @@ Readability.prototype = {
 
       if (this._debug)
         this.log("Article content pre-prep: " + articleContent.innerHTML);
       // So we have all of the content that we need. Now we clean it up for presentation.
       this._prepArticle(articleContent);
       if (this._debug)
         this.log("Article content post-prep: " + articleContent.innerHTML);
 
-      if (this._curPageNum === 1) {
-        if (neededToCreateTopCandidate) {
-          // We already created a fake div thing, and there wouldn't have been any siblings left
-          // for the previous loop, so there's no point trying to create a new div, and then
-          // move all the children over. Just assign IDs and class names here. No need to append
-          // because that already happened anyway.
-          topCandidate.id = "readability-page-1";
-          topCandidate.className = "page";
-        } else {
-          var div = doc.createElement("DIV");
-          div.id = "readability-page-1";
-          div.className = "page";
-          var children = articleContent.childNodes;
-          while (children.length) {
-            div.appendChild(children[0]);
-          }
-          articleContent.appendChild(div);
+      if (neededToCreateTopCandidate) {
+        // We already created a fake div thing, and there wouldn't have been any siblings left
+        // for the previous loop, so there's no point trying to create a new div, and then
+        // move all the children over. Just assign IDs and class names here. No need to append
+        // because that already happened anyway.
+        topCandidate.id = "readability-page-1";
+        topCandidate.className = "page";
+      } else {
+        var div = doc.createElement("DIV");
+        div.id = "readability-page-1";
+        div.className = "page";
+        var children = articleContent.childNodes;
+        while (children.length) {
+          div.appendChild(children[0]);
         }
+        articleContent.appendChild(div);
       }
 
       if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);
 
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < 500) {
+      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
@@ -1243,36 +1254,35 @@ Readability.prototype = {
   /**
    * Remove the style attribute on every e and under.
    * TODO: Test if getElementsByTagName(*) is faster.
    *
    * @param Element
    * @return void
   **/
   _cleanStyles: function(e) {
-    e = e || this._doc;
-    if (!e)
+    if (!e || e.tagName.toLowerCase() === 'svg')
       return;
-    var cur = e.firstChild;
-
-    // Remove any root styles, if we're able.
-    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
-      e.removeAttribute('style');
 
-    // Go until there are no more child nodes
-    while (cur !== null) {
-      if (cur.nodeType === cur.ELEMENT_NODE) {
-        // Remove style attribute(s) :
-        if (cur.className !== "readability-styled")
-          cur.removeAttribute("style");
-
-        this._cleanStyles(cur);
+    if (e.className !== 'readability-styled') {
+      // Remove `style` and deprecated presentational attributes
+      for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+        e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
       }
 
-      cur = cur.nextSibling;
+      if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+        e.removeAttribute('width');
+        e.removeAttribute('height');
+      }
+    }
+
+    var cur = e.firstElementChild;
+    while (cur !== null) {
+      this._cleanStyles(cur);
+      cur = cur.nextElementSibling;
     }
   },
 
   /**
    * Get the density of links as a percentage of the content
    * This is the amount of text that is inside a link divided by the total text in the node.
    *
    * @param Element
@@ -1289,373 +1299,16 @@ Readability.prototype = {
     this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
       linkLength += this._getInnerText(linkNode).length;
     });
 
     return linkLength / textLength;
   },
 
   /**
-   * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
-   *
-   * @author Dan Lacy
-   * @return string the base url
-  **/
-  _findBaseUrl: function() {
-    var uri = this._uri;
-    var noUrlParams = uri.pathQueryRef.split("?")[0];
-    var urlSlashes = noUrlParams.split("/").reverse();
-    var cleanedSegments = [];
-    var possibleType = "";
-
-    for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
-      var segment = urlSlashes[i];
-
-      // Split off and save anything that looks like a file type.
-      if (segment.indexOf(".") !== -1) {
-        possibleType = segment.split(".")[1];
-
-        // If the type isn't alpha-only, it's probably not actually a file extension.
-        if (!possibleType.match(/[^a-zA-Z]/))
-          segment = segment.split(".")[0];
-      }
-
-      // If our first or second segment has anything looking like a page number, remove it.
-      if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
-        segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
-
-      var del = false;
-
-      // If this is purely a number, and it's the first or second segment,
-      // it's probably a page number. Remove it.
-      if (i < 2 && segment.match(/^\d{1,2}$/))
-        del = true;
-
-      // If this is the first segment and it's just "index", remove it.
-      if (i === 0 && segment.toLowerCase() === "index")
-        del = true;
-
-      // If our first or second segment is smaller than 3 characters,
-      // and the first segment was purely alphas, remove it.
-      if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
-        del = true;
-
-      // If it's not marked for deletion, push it to cleanedSegments.
-      if (!del)
-        cleanedSegments.push(segment);
-    }
-
-    // This is our final, cleaned, base article URL.
-    return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
-  },
-
-  /**
-   * Look for any paging links that may occur within the document.
-   *
-   * @param body
-   * @return object (array)
-  **/
-  _findNextPageLink: function(elem) {
-    var uri = this._uri;
-    var possiblePages = {};
-    var allLinks = elem.getElementsByTagName('a');
-    var articleBaseUrl = this._findBaseUrl();
-
-    // Loop through all links, looking for hints that they may be next-page links.
-    // Things like having "page" in their textContent, className or id, or being a child
-    // of a node with a page-y className or id.
-    //
-    // Also possible: levenshtein distance? longest common subsequence?
-    //
-    // After we do that, assign each page a score, and
-    for (var i = 0, il = allLinks.length; i < il; i += 1) {
-      var link = allLinks[i];
-      var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
-
-      // If we've already seen this page, ignore it.
-      if (linkHref === "" ||
-        linkHref === articleBaseUrl ||
-        linkHref === uri.spec ||
-        linkHref in this._parsedPages) {
-        continue;
-      }
-
-      // If it's on a different domain, skip it.
-      if (uri.host !== linkHref.split(/\/+/g)[1])
-        continue;
-
-      var linkText = this._getInnerText(link);
-
-      // If the linkText looks like it's not the next page, skip it.
-      if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
-        continue;
-
-      // If the leftovers of the URL after removing the base URL don't contain
-      // any digits, it's certainly not a next page link.
-      var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
-      if (!linkHrefLeftover.match(/\d/))
-        continue;
-
-      if (!(linkHref in possiblePages)) {
-        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
-      } else {
-        possiblePages[linkHref].linkText += ' | ' + linkText;
-      }
-
-      var linkObj = possiblePages[linkHref];
-
-      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
-      // still be the link, but the odds are lower.
-      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
-      if (linkHref.indexOf(articleBaseUrl) !== 0)
-        linkObj.score -= 25;
-
-      var linkData = linkText + ' ' + link.className + ' ' + link.id;
-      if (linkData.match(this.REGEXPS.nextLink))
-        linkObj.score += 50;
-
-      if (linkData.match(/pag(e|ing|inat)/i))
-        linkObj.score += 25;
-
-      if (linkData.match(/(first|last)/i)) {
-        // -65 is enough to negate any bonuses gotten from a > or » in the text,
-        // If we already matched on "next", last is probably fine.
-        // If we didn't, then it's bad. Penalize.
-        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
-          linkObj.score -= 65;
-      }
-
-      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
-        linkObj.score -= 50;
-
-      if (linkData.match(this.REGEXPS.prevLink))
-        linkObj.score -= 200;
-
-      // If a parentNode contains page or paging or paginat
-      var parentNode = link.parentNode;
-      var positiveNodeMatch = false;
-      var negativeNodeMatch = false;
-
-      while (parentNode) {
-        var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
-        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
-          positiveNodeMatch = true;
-          linkObj.score += 25;
-        }
-
-        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
-          // If this is just something like "footer", give it a negative.
-          // If it's something like "body-and-footer", leave it be.
-          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
-            linkObj.score -= 25;
-            negativeNodeMatch = true;
-          }
-        }
-
-        parentNode = parentNode.parentNode;
-      }
-
-      // If the URL looks like it has paging in it, add to the score.
-      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
-      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
-        linkObj.score += 25;
-
-      // If the URL contains negative values, give a slight decrease.
-      if (linkHref.match(this.REGEXPS.extraneous))
-        linkObj.score -= 15;
-
-      /**
-       * Minor punishment to anything that doesn't match our current URL.
-       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
-       *     Dan, can you show me a counterexample where this is necessary?
-       * if (linkHref.indexOf(window.location.href) !== 0) {
-       *  linkObj.score -= 1;
-       * }
-      **/
-
-      // If the link text can be parsed as a number, give it a minor bonus, with a slight
-      // bias towards lower numbered pages. This is so that pages that might not have 'next'
-      // in their text can still get scored, and sorted properly by score.
-      var linkTextAsNumber = parseInt(linkText, 10);
-      if (linkTextAsNumber) {
-        // Punish 1 since we're either already there, or it's probably
-        // before what we want anyways.
-        if (linkTextAsNumber === 1) {
-          linkObj.score -= 10;
-        } else {
-          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
-        }
-      }
-    }
-
-    // Loop thrugh all of our possible pages from above and find our top
-    // candidate for the next page URL. Require at least a score of 50, which
-    // is a relatively high confidence that this page is the next link.
-    var topPage = null;
-    for (var page in possiblePages) {
-      if (possiblePages.hasOwnProperty(page)) {
-        if (possiblePages[page].score >= 50 &&
-          (!topPage || topPage.score < possiblePages[page].score))
-          topPage = possiblePages[page];
-      }
-    }
-
-    var nextHref = null;
-    if (topPage) {
-      nextHref = topPage.href.replace(/\/$/, '');
-
-      this.log('NEXT PAGE IS ' + nextHref);
-      this._parsedPages[nextHref] = true;
-    }
-    return nextHref;
-  },
-
-  _successfulRequest: function(request) {
-    return (request.status >= 200 && request.status < 300) ||
-        request.status === 304 ||
-         (request.status === 0 && request.responseText);
-  },
-
-  _ajax: function(url, options) {
-    var request = new XMLHttpRequest();
-
-    function respondToReadyState(readyState) {
-      if (request.readyState === 4) {
-        if (this._successfulRequest(request)) {
-          if (options.success)
-            options.success(request);
-        } else if (options.error) {
-          options.error(request);
-        }
-      }
-    }
-
-    if (typeof options === 'undefined')
-      options = {};
-
-    request.onreadystatechange = respondToReadyState;
-
-    request.open('get', url, true);
-    request.setRequestHeader('Accept', 'text/html');
-
-    try {
-      request.send(options.postBody);
-    } catch (e) {
-      if (options.error)
-        options.error();
-    }
-
-    return request;
-  },
-
-  _appendNextPage: function(nextPageLink) {
-    var doc = this._doc;
-    this._curPageNum += 1;
-
-    var articlePage = doc.createElement("DIV");
-    articlePage.id = 'readability-page-' + this._curPageNum;
-    articlePage.className = 'page';
-    articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
-
-    doc.getElementById("readability-content").appendChild(articlePage);
-
-    if (this._curPageNum > this._maxPages) {
-      var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
-      articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
-      return;
-    }
-
-    // Now that we've built the article page DOM element, get the page content
-    // asynchronously and load the cleaned content into the div we created for it.
-    ((pageUrl, thisPage) => {
-      this._ajax(pageUrl, {
-        success: function(r) {
-
-          // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
-          var eTag = r.getResponseHeader('ETag');
-          if (eTag) {
-            if (eTag in this._pageETags) {
-              this.log("Exact duplicate page found via ETag. Aborting.");
-              articlePage.style.display = 'none';
-              return;
-            }
-            this._pageETags[eTag] = 1;
-          }
-
-          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
-          var page = doc.createElement("DIV");
-
-          // Do some preprocessing to our HTML to make it ready for appending.
-          // - Remove any script tags. Swap and reswap newlines with a unicode
-          //   character because multiline regex doesn't work in javascript.
-          // - Turn any noscript tags into divs so that we can parse them. This
-          //   allows us to find any next page links hidden via javascript.
-          // - Turn all double br's into p's - was handled by prepDocument in the original view.
-          //   Maybe in the future abstract out prepDocument to work for both the original document
-          //   and AJAX-added pages.
-          var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
-          responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
-          page.innerHTML = responseHtml;
-          this._replaceBrs(page);
-
-          // Reset all flags for the next page, as they will search through it and
-          // disable as necessary at the end of grabArticle.
-          this._flags = 0x1 | 0x2 | 0x4;
-
-          var secondNextPageLink = this._findNextPageLink(page);
-
-          // NOTE: if we end up supporting _appendNextPage(), we'll need to
-          // change this call to be async
-          var content = this._grabArticle(page);
-
-          if (!content) {
-            this.log("No content found in page to append. Aborting.");
-            return;
-          }
-
-          // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
-          // Compare it against all of the the previous document's we've gotten. If the previous
-          // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
-          var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
-          if (firstP && firstP.innerHTML.length > 100) {
-            for (var i = 1; i <= this._curPageNum; i += 1) {
-              var rPage = doc.getElementById('readability-page-' + i);
-              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
-                this.log('Duplicate of page ' + i + ' - skipping.');
-                articlePage.style.display = 'none';
-                this._parsedPages[pageUrl] = true;
-                return;
-              }
-            }
-          }
-
-          this._removeScripts(content);
-
-          thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
-          // After the page has rendered, post process the content. This delay is necessary because,
-          // in webkit at least, offsetWidth is not set in time to determine image width. We have to
-          // wait a little bit for reflow to finish before we can fix floating images.
-          setTimeout(() => {
-            this._postProcessContent(thisPage);
-          }, 500);
-
-
-          if (secondNextPageLink)
-            this._appendNextPage(secondNextPageLink);
-        }
-      });
-    })(nextPageLink, articlePage);
-  },
-
-  /**
    * Get an elements class/id weight. Uses regular expressions to tell if this
    * element looks good or bad.
    *
    * @param Element
    * @return number (Integer)
   **/
   _getClassWeight: function(e) {
     if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
@@ -1927,20 +1580,16 @@ Readability.prototype = {
       });
     }
   },
 
   _flagIsActive: function(flag) {
     return (this._flags & flag) > 0;
   },
 
-  _addFlag: function(flag) {
-    this._flags = this._flags | flag;
-  },
-
   _removeFlag: function(flag) {
     this._flags = this._flags & ~flag;
   },
 
   /**
    * Decides whether or not the document is reader-able without parsing the whole thing.
    *
    * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
@@ -2021,47 +1670,29 @@ Readability.prototype = {
     }
 
     if (typeof this._doc.documentElement.firstElementChild === "undefined") {
       this._getNextNode = this._getNextNodeNoElementProperties;
     }
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
-    // FIXME: Disabled multi-page article support for now as it
-    // needs more work on infrastructure.
-
-    // Make sure this document is added to the list of parsed pages first,
-    // so we don't double up on the first page.
-    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
-
-    // Pull out any possible next page link first.
-    // var nextPageLink = this._findNextPageLink(doc.body);
-
     this._prepDocument();
 
     var metadata = this._getArticleMetadata();
     this._articleTitle = metadata.title;
 
     var articleContent = this._grabArticle();
     if (!articleContent)
       return null;
 
     this.log("Grabbed: " + articleContent.innerHTML);
 
     this._postProcessContent(articleContent);
 
-    // if (nextPageLink) {
-    //   // Append any additional pages after a small timeout so that people
-    //   // can start reading without having to wait for this to finish processing.
-    //   setTimeout((function() {
-    //     this._appendNextPage(nextPageLink);
-    //   }).bind(this), 500);
-    // }
-
     // If we haven't found an excerpt in the article's metadata, use the article's
     // first paragraph as the excerpt. This is used for displaying a preview of
     // the article's content.
     if (!metadata.excerpt) {
       var paragraphs = articleContent.getElementsByTagName("p");
       if (paragraphs.length > 0) {
         metadata.excerpt = paragraphs[0].textContent.trim();
       }