No bug, update Readability.js to the version in github, rs=margaret,me per discussion earlier today a=readinglist
author Gijs Kruitbosch <gijskruitbosch@gmail.com>
Tue, 17 Mar 2015 20:58:58 -0700
changeset 248433 8492c9dae13c3f5ab25ea54b052393adafa6a8ca
parent 248432 b7c0318cd5b0ac571011aca8e6dfa1ad2298a52a
child 248434 090a386f48d55cfd5c9ebd868013854bb2905aa6
push id 7837
push user jwein@mozilla.com
push date Fri, 27 Mar 2015 00:27:16 +0000
treeherder mozilla-aurora@cb0db44ce60e [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers margaret, me, readinglist
milestone 38.0a2
No bug, update Readability.js to the version in github, rs=margaret,me per discussion earlier today a=readinglist
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -20,19 +20,19 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 /*
  * This code is heavily based on Arc90's readability.js (1.7.1) script
  * available at: http://code.google.com/p/arc90labs-readability
  */
-
+var root = this;
 var Readability = function(uri, doc) {
-  const ENABLE_LOGGING = false;
+  var ENABLE_LOGGING = false;
 
   this._uri = uri;
   this._doc = doc;
   this._biggestFrame = false;
   this._articleByline = null;
   this._articleDir = null;
 
   // Start with all flags set
@@ -48,18 +48,36 @@ var Readability = function(uri, doc) {
   // we'll know it's a duplicate.
   this._pageETags = {};
 
   // Make an AJAX request for each page and append it to the document.
   this._curPageNum = 1;
 
   // Control whether log messages are sent to the console
   if (ENABLE_LOGGING) {
-    this.log = function (msg) {
-      dump("Reader: (Readability) " + msg);
+    function logEl(e) {
+      var rv = e.nodeName + " ";
+      if (e.nodeType == e.TEXT_NODE) {
+        return rv + '("' + e.textContent + '")';
+      }
+      var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
+      var elDesc = e.id ? "(#" + e.id + classDesc + ")" :
+                          (classDesc ? "(" + classDesc + ")" : "");
+      return rv + elDesc;
+    }
+    this.log = function () {
+      if ("dump" in root) {
+        var msg = Array.prototype.map.call(arguments, function(x) {
+          return (x && x.nodeName) ? logEl(x) : x;
+        }).join(" ");
+        dump("Reader: (Readability) " + msg + "\n");
+      } else if ("console" in root) {
+        var args = ["Reader: (Readability) "].concat(arguments);
+        console.log.apply(console, args);
+      }
     };
   } else {
     this.log = function () {};
   }
 }
 
 Readability.prototype = {
   FLAG_STRIP_UNLIKELYS: 0x1,
@@ -198,37 +216,25 @@ Readability.prototype = {
    * Prepare the HTML document for readability to scrape it.
    * This includes things like stripping javascript, CSS, and handling terrible markup.
    *
    * @return void
    **/
   _prepDocument: function() {
     var doc = this._doc;
 
-    // In some cases a body element can't be found (if the HTML is
-    // totally hosed for example) so we create a new body node and
-    // append it to the document.
-    if (!doc.body) {
-      var body = doc.createElement("body");
-
-      try {
-        doc.body = body;
-      } catch(e) {
-        doc.documentElement.appendChild(body);
-        this.log(e);
-      }
-    }
-
     // Remove all style tags in head
     var styleTags = doc.getElementsByTagName("style");
     for (var st = 0; st < styleTags.length; st += 1) {
       styleTags[st].textContent = "";
     }
 
-    this._replaceBrs(doc.body);
+    if (doc.body) {
+      this._replaceBrs(doc.body);
+    }
 
     var fonts = doc.getElementsByTagName("FONT");
     for (var i = fonts.length; --i >=0;) {
       this._setNodeTag(fonts[i], "SPAN");
     }
   },
 
   /**
@@ -407,16 +413,23 @@ Readability.prototype = {
    *
    * @param page a document to run upon. Needs to be a full document, complete with body.
    * @return Element
   **/
   _grabArticle: function (page) {
     var doc = this._doc;
     var isPaging = (page !== null ? true: false);
     page = page ? page : this._doc.body;
+
+    // We can't grab an article if we don't have a page!
+    if (!page) {
+      this.log("No body found in document. Abort.");
+      return null;
+    }
+
     var pageCacheHtml = page.innerHTML;
 
     // Check if any "dir" is set on the toplevel document element
     this._articleDir = doc.documentElement.getAttribute("dir");
 
     //helper function used below in the 'while' loop:
     function purgeNode(node, allElements) {
       for (var i = node.childNodes.length; --i >= 0;) {
@@ -571,41 +584,43 @@ Readability.prototype = {
         var candidate = candidates[c];
 
         // Scale the final candidates score based on link density. Good content
         // should have a relatively small link density (5% or less) and be mostly
         // unaffected by this operation.
         var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
         candidate.readability.contentScore = candidateScore;
 
-        this.log('Candidate: ' + candidate + " (" + candidate.className + ":" +
-          candidate.id + ") with score " + candidateScore);
+        this.log('Candidate:', candidate, "with score " + candidateScore);
 
         for (var t = 0; t < this.N_TOP_CANDIDATES; t++) {
           var aTopCandidate = topCandidates[t];
 
           if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
             topCandidates.splice(t, 0, candidate);
             if (topCandidates.length > this.N_TOP_CANDIDATES)
               topCandidates.pop();
             break;
           }
         }
       }
 
       var topCandidate = topCandidates[0] || null;
+      var neededToCreateTopCandidate = false;
 
       // If we still have no top candidate, just use the body as a last resort.
       // We also have to copy the body node so it is something we can modify.
       if (topCandidate === null || topCandidate.tagName === "BODY") {
         // Move all of the page's children into topCandidate
         topCandidate = doc.createElement("DIV");
+        neededToCreateTopCandidate = true;
         var children = page.childNodes;
-        for (var i = 0; i < children.length; ++i) {
-          topCandidate.appendChild(children[i]);
+        while (children.length) {
+          this.log("Moving child out:", children[0]);
+          topCandidate.appendChild(children[0]);
         }
 
         page.appendChild(topCandidate);
 
         this._initializeNode(topCandidate);
       }
 
       // Now that we have the top candidate, look through its siblings for content
@@ -617,17 +632,17 @@ Readability.prototype = {
 
       var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
       var siblingNodes = topCandidate.parentNode.childNodes;
 
       for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
         var siblingNode = siblingNodes[s];
         var append = false;
 
-        this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
+        this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : ''));
         this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
 
         if (siblingNode === topCandidate)
           append = true;
 
         var contentBonus = 0;
 
         // Give a bonus if sibling nodes and top candidates have the example same classname
@@ -646,57 +661,70 @@ Readability.prototype = {
           if (nodeLength > 80 && linkDensity < 0.25) {
             append = true;
           } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
             append = true;
           }
         }
 
         if (append) {
-          this.log("Appending node: " + siblingNode);
+          this.log("Appending node:", siblingNode);
 
           // siblingNodes is a reference to the childNodes array, and
           // siblingNode is removed from the array when we call appendChild()
           // below. As a result, we must revisit this index since the nodes
           // have been shifted.
           s -= 1;
           sl -= 1;
 
           if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
             // We have a node that isn't a common block level element, like a form or td tag.
             // Turn it into a div so it doesn't get filtered out later by accident. */
-            this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
+            this.log("Altering siblingNode:", siblingNode, 'to div.');
 
             this._setNodeTag(siblingNode, "DIV");
           }
 
           // To ensure a node does not interfere with readability styles,
           // remove its classnames.
-          siblingNode.className = "";
+          siblingNode.removeAttribute("class");
 
           // Append sibling and subtract from our list because it removes
           // the node when you append to another node.
           articleContent.appendChild(siblingNode);
         }
       }
 
+      this.log("Article content pre-prep: " + articleContent.innerHTML);
       // So we have all of the content that we need. Now we clean it up for presentation.
       this._prepArticle(articleContent);
+      this.log("Article content post-prep: " + articleContent.innerHTML);
 
       if (this._curPageNum === 1) {
-        var div = doc.createElement("DIV");
-        div.id = "readability-page-1";
-        div.className = "page";
-        var children = articleContent.childNodes;
-        for (var i = 0; i < children.length; ++i) {
-          div.appendChild(children[i]);
+        if (neededToCreateTopCandidate) {
+          // We already created a fake div thing, and there wouldn't have been any siblings left
+          // for the previous loop, so there's no point trying to create a new div, and then
+          // move all the children over. Just assign IDs and class names here. No need to append
+          // because that already happened anyway.
+          topCandidate.id = "readability-page-1";
+          topCandidate.className = "page";
+        } else {
+          var div = doc.createElement("DIV");
+          div.id = "readability-page-1";
+          div.className = "page";
+          var children = articleContent.childNodes;
+          while (children.length) {
+            div.appendChild(children[0]);
+          }
+          articleContent.appendChild(div);
         }
-        articleContent.appendChild(div);
       }
 
+      this.log("Article content after paging: " + articleContent.innerHTML);
+
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
       if (this._getInnerText(articleContent, true).length < 500) {
         page.innerHTML = pageCacheHtml;
 
@@ -1396,17 +1424,17 @@ Readability.prototype = {
     // Traverse backwards so we can remove nodes at the same time
     // without effecting the traversal.
     //
     // TODO: Consider taking into account original contentScore here.
     for (var i = curTagsLength-1; i >= 0; i -= 1) {
       var weight = this._getClassWeight(tagsList[i]);
       var contentScore = 0;
 
-      this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")");
+      this.log("Cleaning Conditionally", tagsList[i]);
 
       if (weight + contentScore < 0) {
         tagsList[i].parentNode.removeChild(tagsList[i]);
       } else if (this._getCharCount(tagsList[i],',') < 10) {
         // If there are not very many commas, and the number of
         // non-paragraph elements is more than paragraphs or other
         // ominous signs, remove the element.
         var p = tagsList[i].getElementsByTagName("p").length;
@@ -1503,16 +1531,18 @@ Readability.prototype = {
 
     this._prepDocument();
 
     var articleTitle = this._getArticleTitle();
     var articleContent = this._grabArticle();
     if (!articleContent)
       return null;
 
+    this.log("Grabbed: " + articleContent.innerHTML);
+
     this._postProcessContent(articleContent);
 
     // if (nextPageLink) {
     //   // Append any additional pages after a small timeout so that people
     //   // can start reading without having to wait for this to finish processing.
     //   setTimeout((function() {
     //     this._appendNextPage(nextPageLink);
     //   }).bind(this), 500);