Bug 1444082 - sync reader mode to github tip ( 8525c6af36d3badbe27c4672a6f2dd99ddb4097f ), r=johannh
☠☠ backed out by dd45b23b67c8 ☠ ☠
authorGijs Kruitbosch <gijskruitbosch@gmail.com>
Thu, 08 Mar 2018 14:35:02 +0000
changeset 462476 609e4952a46e77839c34afac884876154bd24482
parent 462475 8a988d1a6f23f900b90c7843e2c61db14a57e56c
child 462477 f749f8d424c44b23815245374522c77f982e66bf
push id1683
push usersfraser@mozilla.com
push dateThu, 26 Apr 2018 16:43:40 +0000
treeherdermozilla-release@5af6cb21869d [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjohannh
bugs1444082
milestone60.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1444082 - sync reader mode to github tip ( 8525c6af36d3badbe27c4672a6f2dd99ddb4097f ), r=johannh MozReview-Commit-ID: LZLFf9kyUR5
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
toolkit/components/reader/ReaderWorker.js
toolkit/components/reader/test/readerModeNonArticle.html
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -555,17 +555,18 @@
       delete this._textContent;
     },
     set textContent(newText) {
       this._textContent = newText;
       delete this._innerHTML;
     },
   };
 
-  var Document = function () {
+  var Document = function (url) {
+    this.documentURI = url;
     this.styleSheets = [];
     this.childNodes = [];
     this.children = [];
   };
 
   Document.prototype = {
     __proto__: Node.prototype,
 
@@ -595,16 +596,30 @@
       return node;
     },
 
     createTextNode: function (text) {
       var node = new Text();
       node.textContent = text;
       return node;
     },
+
+    get baseURI() {
+      if (!this.hasOwnProperty("_baseURI")) {
+        this._baseURI = this.documentURI;
+        var baseElements = this.getElementsByTagName("base");
+        var href = baseElements[0] && baseElements[0].getAttribute("href");
+        if (href) {
+          try {
+            this._baseURI = (new URL(href, this._baseURI)).href;
+          } catch (ex) {/* Just fall back to documentURI */}
+        }
+      }
+      return this._baseURI;
+    },
   };
 
   var Element = function (tag) {
     this.attributes = [];
     this.childNodes = [];
     this.children = [];
     this.nextElementSibling = this.previousElementSibling = null;
     this.localName = tag.toLowerCase();
@@ -1113,19 +1128,19 @@
       }
 
       return node;
     },
 
     /**
      * Parses an HTML string and returns a JS implementation of the Document.
      */
-    parse: function (html) {
+    parse: function (html, url) {
       this.html = html;
-      var doc = this.doc = new Document();
+      var doc = this.doc = new Document(url);
       this.readChildren(doc);
 
       // If this is an HTML document, remove root-level children except for the
       // <html> node
       if (doc.documentElement) {
         for (var i = doc.childNodes.length; --i >= 0;) {
           var child = doc.childNodes[i];
           if (child !== doc.documentElement) {
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -36,16 +36,17 @@
 function Readability(uri, doc, options) {
   options = options || {};
 
   this._uri = uri;
   this._doc = doc;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
+  this._attempts = [];
 
   // Configurable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
   this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
 
@@ -270,44 +271,30 @@ Readability.prototype = {
   /**
    * Converts each <a> and <img> uri in the given element to an absolute URI,
    * ignoring #ref URIs.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
-    var scheme = this._uri.scheme;
-    var prePath = this._uri.prePath;
-    var pathBase = this._uri.pathBase;
-
+    var baseURI = this._doc.baseURI;
+    var documentURI = this._doc.documentURI;
     function toAbsoluteURI(uri) {
-      // If this is already an absolute URI, return it.
-      if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
+      // Leave hash links alone if the base URI matches the document URI:
+      if (baseURI == documentURI && uri.charAt(0) == "#") {
         return uri;
-
-      // Scheme-rooted relative URI.
-      if (uri.substr(0, 2) == "//")
-        return scheme + "://" + uri.substr(2);
-
-      // Prepath-rooted relative URI.
-      if (uri[0] == "/")
-        return prePath + uri;
-
-      // Dotslash relative URI.
-      if (uri.indexOf("./") === 0)
-        return pathBase + uri.slice(2);
-
-      // Ignore hash URIs:
-      if (uri[0] == "#")
-        return uri;
-
-      // Standard relative URI; add entire path. pathBase already includes a
-      // trailing "/".
-      return pathBase + uri;
+      }
+      // Otherwise, resolve against base URI:
+      try {
+        return new URL(uri, baseURI).href;
+      } catch (ex) {
+        // Something went wrong, just return the original:
+      }
+      return uri;
     }
 
     var links = articleContent.getElementsByTagName("a");
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
       if (href) {
         // Replace links with javascript: URIs with text content, since
         // they won't work after scripts have been removed from the page.
@@ -530,16 +517,17 @@ Readability.prototype = {
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._cleanConditionally(articleContent, "fieldset");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
+    this._clean(articleContent, "link");
 
     // Clean out elements have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
     this._forEachNode(articleContent.children, function(topCandidate) {
       this._cleanMatchedNodes(topCandidate, /share/);
     });
 
     // If there is only one h2 and its text content substantially equals article title,
@@ -1084,34 +1072,55 @@ Readability.prototype = {
           div.appendChild(children[0]);
         }
         articleContent.appendChild(div);
       }
 
       if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);
 
+      var parseSuccessful = true;
+
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+      var textLength = this._getInnerText(articleContent, true).length;
+      if (textLength < this._wordThreshold) {
+        parseSuccessful = false;
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else {
-          return null;
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
+          // No luck after removing flags, just return the longest text we found during the different loops
+          this._attempts.sort(function (a, b) {
+            return a.textLength < b.textLength;
+          });
+
+          // But first check if we actually have something
+          if (!this._attempts[0].textLength) {
+            return null;
+          }
+
+          articleContent = this._attempts[0].articleContent;
+          parseSuccessful = true;
         }
-      } else {
+      }
+
+      if (parseSuccessful) {
         // Find out text direction from ancestors of final top candidate.
         var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
         this._someNode(ancestors, function(ancestor) {
           if (!ancestor.tagName)
             return false;
           var articleDir = ancestor.getAttribute("dir");
           if (articleDir) {
             this._articleDir = articleDir;
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -42,12 +42,12 @@ var Agent = {
    *
    * @param {object} uri URI data for the document.
    * @param {string} serializedDoc The serialized document.
    * @param {object} options Options object to pass to Readability.
    *
    * @return {object} Article object returned from Readability.
    */
   parseDocument(uri, serializedDoc, options) {
-    let doc = new JSDOMParser().parse(serializedDoc);
+    let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
     return new Readability(uri, doc, options).parse();
   },
 };
--- a/toolkit/components/reader/test/readerModeNonArticle.html
+++ b/toolkit/components/reader/test/readerModeNonArticle.html
@@ -1,14 +1,9 @@
 <!DOCTYPE html>
 <html>
 <head>
 <title>Non article title</title>
 <meta name="description" content="This is the non-article description." />
 </head>
 <body>
-<header>Site header</header>
-<div>
-<h1>Non article title</h1>
-<p>Woot!</p>
-</div>
 </body>
 </html>