Bug 1155692 - Include latest Readability/JSDOMParser changes into m-c. a=sledru
author Gijs Kruitbosch <gijskruitbosch@gmail.com>
Fri, 17 Apr 2015 16:02:19 +0100
changeset 258534 eb5e2063637b
parent 258533 44de10db57a6
child 258535 746934eab883
push id 4690
push user ryanvm@gmail.com
push date 2015-04-20 16:04 +0000
treeherder mozilla-beta@eb5e2063637b [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers sledru
bugs 1155692
milestone 38.0
Bug 1155692 - Include latest Readability/JSDOMParser changes into m-c. a=sledru
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -50,16 +50,22 @@
   var reverseEntityTable = {
     "<": "&lt;",
     ">": "&gt;",
     "&": "&amp;",
     '"': "&quot;",
     "'": "&apos;",
   };
 
+  function encodeTextContentHTML(s) {
+    return s.replace(/[&<>]/g, function(x) {
+      return reverseEntityTable[x];
+    });
+  }
+
   function encodeHTML(s) {
     return s.replace(/[&<>'"]/g, function(x) {
       return reverseEntityTable[x];
     });
   }
 
   function decodeHTML(str) {
     return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) {
@@ -537,17 +543,17 @@
     get textContent() {
       if (typeof this._textContent === "undefined") {
         this._textContent = decodeHTML(this._innerHTML || "");
       }
       return this._textContent;
     },
     get innerHTML() {
       if (typeof this._innerHTML === "undefined") {
-        this._innerHTML = encodeHTML(this._textContent || "");
+        this._innerHTML = encodeTextContentHTML(this._textContent || "");
       }
       return this._innerHTML;
     },
 
     set innerHTML(newHTML) {
       this._innerHTML = newHTML;
       delete this._textContent;
     },
@@ -585,17 +591,23 @@
         return null;
       }
       return getElem(this);
     },
 
     createElement: function (tag) {
       var node = new Element(tag);
       return node;
-    }
+    },
+
+    createTextNode: function (text) {
+      var node = new Text();
+      node.textContent = text;
+      return node;
+    },
   };
 
   var Element = function (tag) {
     this.attributes = [];
     this.childNodes = [];
     this.children = [];
     this.nextElementSibling = this.previousElementSibling = null;
     this.localName = tag.toLowerCase();
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -199,30 +199,38 @@ Readability.prototype = {
       if (uri.indexOf("./") === 0)
         return pathBase + uri.slice(2);
 
       // Standard relative URI; add entire path. pathBase already includes a
       // trailing "/".
       return pathBase + uri;
     }
 
-    function convertRelativeURIs(tagName, propName) {
-      var elems = articleContent.getElementsByTagName(tagName);
-      this._forEachNode(elems, function(elem) {
-        var relativeURI = elem.getAttribute(propName);
-        if (relativeURI != null)
-          elem.setAttribute(propName, toAbsoluteURI(relativeURI));
-      });
-    }
+    var links = articleContent.getElementsByTagName("a");
+    this._forEachNode(links, function(link) {
+      var href = link.getAttribute("href");
+      if (href) {
+        // Replace links with javascript: URIs with text content, since
+        // they won't work after scripts have been removed from the page.
+        if (href.indexOf("javascript:") === 0) {
+          var text = this._doc.createTextNode(link.textContent);
+          link.parentNode.replaceChild(text, link);
+        } else {
+          link.setAttribute("href", toAbsoluteURI(href));
+        }
+      }
+    });
 
-     // Fix links.
-    convertRelativeURIs.call(this, "a", "href");
-
-     // Fix images.
-    convertRelativeURIs.call(this, "img", "src");
+    var imgs = articleContent.getElementsByTagName("img");
+    this._forEachNode(imgs, function(img) {
+      var src = img.getAttribute("src");
+      if (src) {
+        img.setAttribute("src", toAbsoluteURI(src));
+      }
+    });
   },
 
   /**
    * Get the article title as an H1.
    *
    * @return void
    **/
   _getArticleTitle: function() {
@@ -589,17 +597,18 @@ Readability.prototype = {
           node = this._removeAndGetNext(node);
           continue;
         }
 
         // Remove unlikely candidates
         if (stripUnlikelyCandidates) {
           if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
-              node.tagName !== "BODY") {
+              node.tagName !== "BODY" &&
+              node.tagName !== "A") {
             this.log("Removing unlikely candidate - " + matchString);
             node = this._removeAndGetNext(node);
             continue;
           }
         }
 
         if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
           elementsToScore.push(node);