Bug 1158184 - merge recent github readability changes into m-c, rs=me
author: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Fri, 24 Apr 2015 16:20:02 +0100
changeset 241141 c6e4ebeb0e69d61998ab50e1d14fc4cd885cca17
parent 241140 836194cdafc404094ba72f58d63e0ee952d37fe4
child 241142 f0dd7524cbf3835289ada8a48b51f86d1a24ff54
push id: 59036
push user: cbook@mozilla.com
push date: Mon, 27 Apr 2015 10:37:48 +0000
treeherder: mozilla-inbound@ad388474898c [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: me
bugs: 1158184
milestone: 40.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1158184 - merge recent github readability changes into m-c, rs=me
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -167,16 +167,31 @@ Readability.prototype = {
    * @param  Function fn       The iterate function.
    * @return Boolean
    */
   _someNode: function(nodeList, fn) {
     return Array.prototype.some.call(nodeList, fn, this);
   },
 
   /**
+   * Concat all nodelists passed as arguments.
+   *
+   * @param  ...NodeList
+   * @return Array
+   */
+  _concatNodeLists: function() {
+    var slice = Array.prototype.slice;
+    var args = slice.call(arguments);
+    var nodeLists = args.map(function(list) {
+      return slice.call(list);
+    });
+    return Array.prototype.concat.apply([], nodeLists);
+  },
+
+  /**
    * Converts each <a> and <img> uri in the given element to an absolute URI.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
     var scheme = this._uri.scheme;
     var prePath = this._uri.prePath;
@@ -247,20 +262,34 @@ Readability.prototype = {
     } catch(e) {}
 
     if (curTitle.match(/ [\|\-] /)) {
       curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
 
       if (curTitle.split(' ').length < 3)
         curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
     } else if (curTitle.indexOf(': ') !== -1) {
-      curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
+      // Check if we have a heading containing this exact string, so we
+      // can assume it's the full title.
+      var headings = this._concatNodeLists(
+        doc.getElementsByTagName('h1'),
+        doc.getElementsByTagName('h2')
+      );
+      var match = this._someNode(headings, function(heading) {
+        return heading.textContent === curTitle;
+      });
 
-      if (curTitle.split(' ').length < 3)
-        curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
+      // If we don't, let's extract the title out of the original title string.
+      if (!match) {
+        curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
+
+        // If the title is now too short, try the first colon instead:
+        if (curTitle.split(' ').length < 3)
+          curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
+      }
     } else if (curTitle.length > 150 || curTitle.length < 15) {
       var hOnes = doc.getElementsByTagName('h1');
 
       if (hOnes.length === 1)
         curTitle = this._getInnerText(hOnes[0]);
     }
 
     curTitle = curTitle.trim();
@@ -391,16 +420,17 @@ Readability.prototype = {
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
+    this._clean(articleContent, "footer");
 
     // If there is only one h2, they are probably using it as a header
     // and not a subheader, so remove it since we already have a header.
     if (articleContent.getElementsByTagName('h2').length === 1)
       this._clean(articleContent, "h2");
 
     this._clean(articleContent, "iframe");
     this._cleanHeaders(articleContent);
@@ -908,20 +938,20 @@ Readability.prototype = {
    */
   _getArticleMetadata: function() {
     var metadata = {};
     var values = {};
     var metaElements = this._doc.getElementsByTagName("meta");
 
     // Match "description", or Twitter's "twitter:description" (Cards)
     // in name attribute.
-    var namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi;
+    var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
 
-    // Match Facebook's og:description (Open Graph) in property attribute.
-    var propertyPattern = /^\s*og\s*:\s*description\s*$/gi;
+    // Match Facebook's Open Graph title & description properties.
+    var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
 
     // Find description tags.
     this._forEachNode(metaElements, function(element) {
       var elementName = element.getAttribute("name");
       var elementProperty = element.getAttribute("property");
 
       if (elementName === "author") {
         metadata.byline = element.getAttribute("content");
@@ -951,16 +981,24 @@ Readability.prototype = {
     } else if ("og:description" in values) {
       // Use facebook open graph description.
       metadata.excerpt = values["og:description"];
     } else if ("twitter:description" in values) {
       // Use twitter cards description.
       metadata.excerpt = values["twitter:description"];
     }
 
+    if ("og:title" in values) {
+      // Use facebook open graph title.
+      metadata.title = values["og:title"];
+    } else if ("twitter:title" in values) {
+      // Use twitter cards title.
+      metadata.title = values["twitter:title"];
+    }
+
     return metadata;
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
@@ -1710,18 +1748,18 @@ Readability.prototype = {
     // so we don't double up on the first page.
     // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
 
     // Pull out any possible next page link first.
     // var nextPageLink = this._findNextPageLink(doc.body);
 
     this._prepDocument();
 
-    var articleTitle = this._getArticleTitle();
     var metadata = this._getArticleMetadata();
+    var articleTitle = metadata.title || this._getArticleTitle();
 
     var articleContent = this._grabArticle();
     if (!articleContent)
       return null;
 
     this.log("Grabbed: " + articleContent.innerHTML);
 
     this._postProcessContent(articleContent);