Bug 1516877 - update Readability code from upstream (git rev 15d411a8652ca35f553a2465a5bdc994be90a813), rs=already-reviewed
author Gijs Kruitbosch <gijskruitbosch@gmail.com>
Thu, 03 Jan 2019 14:28:01 +0000
changeset 509553 42156d5817a6ef502138d88f9ab38d2f460c7ade
parent 509552 df0c81365e998bc9e25920260230d3eed8fe0210
child 509554 9c9f8232272eda52a244059234f6f3ca9fc2afdd
push id 10547
push user ffxbld-merge
push date Mon, 21 Jan 2019 13:03:58 +0000
treeherder mozilla-beta@24ec1916bffe [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers already-reviewed
bugs 1516877
milestone 66.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1516877 - update Readability code from upstream (git rev 15d411a8652ca35f553a2465a5bdc994be90a813), rs=already-reviewed
toolkit/components/reader/Readability-readerable.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability-readerable.js
+++ b/toolkit/components/reader/Readability-readerable.js
@@ -1,11 +1,18 @@
 /* eslint-env es6:false */
 /* globals exports */
 /*
+ * DO NOT MODIFY THIS FILE DIRECTLY!
+ *
+ * This is a shared library that is maintained in an external repo:
+ * https://github.com/mozilla/readability
+ */
+
+/*
  * Copyright (c) 2010 Arc90 Inc
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -24,17 +31,18 @@
 var REGEXPS = {
   // NOTE: These two regular expressions are duplicated in
   // Readability.js. Please keep both copies in sync.
   unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
   okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
 };
 
 function isNodeVisible(node) {
-  return node.style.display != "none" && !node.hasAttribute("hidden");
+  // Have to null-check node.style to deal with SVG and MathML nodes.
+  return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
 }
 
 /**
  * Decides whether or not the document is reader-able without parsing the whole thing.
  *
  * @return boolean Whether or not we suspect Readability.parse() will suceeed at returning an article object.
  */
 function isProbablyReaderable(doc, isVisible) {
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -1,10 +1,17 @@
 /*eslint-env es6:false*/
 /*
+ * DO NOT MODIFY THIS FILE DIRECTLY!
+ *
+ * This is a shared library that is maintained in an external repo:
+ * https://github.com/mozilla/readability
+ */
+
+/*
  * Copyright (c) 2010 Arc90 Inc
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -34,16 +41,17 @@ function Readability(doc, options) {
     throw new Error("First argument to Readability constructor should be a document object.");
   }
   options = options || {};
 
   this._doc = doc;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
+  this._articleSiteName = null;
   this._attempts = [];
 
   // Configurable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
@@ -318,32 +326,32 @@ Readability.prototype = {
       try {
         return new URL(uri, baseURI).href;
       } catch (ex) {
         // Something went wrong, just return the original:
       }
       return uri;
     }
 
-    var links = articleContent.getElementsByTagName("a");
+    var links = this._getAllNodesWithTag(articleContent, ["a"]);
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
       if (href) {
         // Replace links with javascript: URIs with text content, since
         // they won't work after scripts have been removed from the page.
         if (href.indexOf("javascript:") === 0) {
           var text = this._doc.createTextNode(link.textContent);
           link.parentNode.replaceChild(text, link);
         } else {
           link.setAttribute("href", toAbsoluteURI(href));
         }
       }
     });
 
-    var imgs = articleContent.getElementsByTagName("img");
+    var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
     this._forEachNode(imgs, function(img) {
       var src = img.getAttribute("src");
       if (src) {
         img.setAttribute("src", toAbsoluteURI(src));
       }
     });
   },
 
@@ -406,17 +414,17 @@ Readability.prototype = {
       }
     } else if (curTitle.length > 150 || curTitle.length < 15) {
       var hOnes = doc.getElementsByTagName("h1");
 
       if (hOnes.length === 1)
         curTitle = this._getInnerText(hOnes[0]);
     }
 
-    curTitle = curTitle.trim();
+    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
     // If we now have 4 words or fewer as our title, and either no
     // 'hierarchical' separators (\, /, > or ») were found in the original
     // title or we decreased the number of words by more than 1 word, use
     // the original title.
     var curTitleWordCount = wordCount(curTitle);
     if (curTitleWordCount <= 4 &&
         (!titleHadHierarchicalSeparators ||
          curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
@@ -1139,17 +1147,17 @@ Readability.prototype = {
           this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
           this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else {
           this._attempts.push({articleContent: articleContent, textLength: textLength});
           // No luck after removing flags, just return the longest text we found during the different loops
           this._attempts.sort(function (a, b) {
-            return a.textLength < b.textLength;
+            return b.textLength - a.textLength;
           });
 
           // But first check if we actually have something
           if (!this._attempts[0].textLength) {
             return null;
           }
 
           articleContent = this._attempts[0].articleContent;
@@ -1197,20 +1205,20 @@ Readability.prototype = {
    * @return Object with optional "excerpt" and "byline" properties
    */
   _getArticleMetadata: function() {
     var metadata = {};
     var values = {};
     var metaElements = this._doc.getElementsByTagName("meta");
 
     // property is a space-separated list of values
-    var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi;
+    var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
 
     // name is a single value
-    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title)\s*$/i;
+    var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
 
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
       var content = element.getAttribute("content");
       var matches = null;
       var name = null;
@@ -1260,16 +1268,19 @@ Readability.prototype = {
     metadata.excerpt = values["dc:description"] ||
                        values["dcterm:description"] ||
                        values["og:description"] ||
                        values["weibo:article:description"] ||
                        values["weibo:webpage:description"] ||
                        values["description"] ||
                        values["twitter:description"];
 
+    // get site name
+    metadata.siteName = values["og:site_name"];
+
     return metadata;
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
@@ -1699,17 +1710,17 @@ Readability.prototype = {
     return (this._flags & flag) > 0;
   },
 
   _removeFlag: function(flag) {
     this._flags = this._flags & ~flag;
   },
 
   _isProbablyVisible: function(node) {
-    return node.style.display != "none" && !node.hasAttribute("hidden");
+    return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
   },
 
   /**
    * Runs readability.
    *
    * Workflow:
    *  1. Prep the document by removing script tags, css, etc.
    *  2. Build readability's DOM tree.
@@ -1758,15 +1769,16 @@ Readability.prototype = {
     return {
       title: this._articleTitle,
       byline: metadata.byline || this._articleByline,
       dir: this._articleDir,
       content: articleContent.innerHTML,
       textContent: textContent,
       length: textContent.length,
       excerpt: metadata.excerpt,
+      siteName: metadata.siteName || this._articleSiteName
     };
   }
 };
 
 if (typeof module === "object") {
   module.exports = Readability;
 }