author | Gijs Kruitbosch <gijskruitbosch@gmail.com> |
Fri, 24 Apr 2015 16:20:02 +0100 | |
changeset 241141 | c6e4ebeb0e69d61998ab50e1d14fc4cd885cca17 |
parent 241140 | 836194cdafc404094ba72f58d63e0ee952d37fe4 |
child 241142 | f0dd7524cbf3835289ada8a48b51f86d1a24ff54 |
push id | 59036 |
push user | cbook@mozilla.com |
push date | Mon, 27 Apr 2015 10:37:48 +0000 |
treeherder | mozilla-inbound@ad388474898c [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | me |
bugs | 1158184 |
milestone | 40.0a1 |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -167,16 +167,31 @@ Readability.prototype = { * @param Function fn The iterate function. * @return Boolean */ _someNode: function(nodeList, fn) { return Array.prototype.some.call(nodeList, fn, this); }, /** + * Concat all nodelists passed as arguments. + * + * @param ...NodeList + * @return Array + */ + _concatNodeLists: function() { + var slice = Array.prototype.slice; + var args = slice.call(arguments); + var nodeLists = args.map(function(list) { + return slice.call(list); + }); + return Array.prototype.concat.apply([], nodeLists); + }, + + /** * Converts each <a> and <img> uri in the given element to an absolute URI. * * @param Element * @return void */ _fixRelativeUris: function(articleContent) { var scheme = this._uri.scheme; var prePath = this._uri.prePath; @@ -247,20 +262,34 @@ Readability.prototype = { } catch(e) {} if (curTitle.match(/ [\|\-] /)) { curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); if (curTitle.split(' ').length < 3) curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); } else if (curTitle.indexOf(': ') !== -1) { - curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); + // Check if we have a heading containing this exact string, so we + // could assume it's the full title. + var headings = this._concatNodeLists( + doc.getElementsByTagName('h1'), + doc.getElementsByTagName('h2') + ); + var match = this._someNode(headings, function(heading) { + return heading.textContent === curTitle; + }); - if (curTitle.split(' ').length < 3) - curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); + // If we don't, let's extract the title out of the original title string. 
+ if (!match) { + curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1); + + // If the title is now too short, try the first colon instead: + if (curTitle.split(' ').length < 3) + curTitle = origTitle.substring(origTitle.indexOf(':') + 1); + } } else if (curTitle.length > 150 || curTitle.length < 15) { var hOnes = doc.getElementsByTagName('h1'); if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]); } curTitle = curTitle.trim(); @@ -391,16 +420,17 @@ Readability.prototype = { _prepArticle: function(articleContent) { this._cleanStyles(articleContent); // Clean out junk from the article content this._cleanConditionally(articleContent, "form"); this._clean(articleContent, "object"); this._clean(articleContent, "embed"); this._clean(articleContent, "h1"); + this._clean(articleContent, "footer"); // If there is only one h2, they are probably using it as a header // and not a subheader, so remove it since we already have a header. if (articleContent.getElementsByTagName('h2').length === 1) this._clean(articleContent, "h2"); this._clean(articleContent, "iframe"); this._cleanHeaders(articleContent); @@ -908,20 +938,20 @@ Readability.prototype = { */ _getArticleMetadata: function() { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); // Match "description", or Twitter's "twitter:description" (Cards) // in name attribute. - var namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi; + var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi; - // Match Facebook's og:description (Open Graph) in property attribute. - var propertyPattern = /^\s*og\s*:\s*description\s*$/gi; + // Match Facebook's Open Graph title & description properties. + var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi; // Find description tags. 
this._forEachNode(metaElements, function(element) { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); if (elementName === "author") { metadata.byline = element.getAttribute("content"); @@ -951,16 +981,24 @@ Readability.prototype = { } else if ("og:description" in values) { // Use facebook open graph description. metadata.excerpt = values["og:description"]; } else if ("twitter:description" in values) { // Use twitter cards description. metadata.excerpt = values["twitter:description"]; } + if ("og:title" in values) { + // Use facebook open graph title. + metadata.title = values["og:title"]; + } else if ("twitter:title" in values) { + // Use twitter cards title. + metadata.title = values["twitter:title"]; + } + return metadata; }, /** * Removes script tags from the document. * * @param Element **/ @@ -1710,18 +1748,18 @@ Readability.prototype = { // so we don't double up on the first page. // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; // Pull out any possible next page link first. // var nextPageLink = this._findNextPageLink(doc.body); this._prepDocument(); - var articleTitle = this._getArticleTitle(); var metadata = this._getArticleMetadata(); + var articleTitle = metadata.title || this._getArticleTitle(); var articleContent = this._grabArticle(); if (!articleContent) return null; this.log("Grabbed: " + articleContent.innerHTML); this._postProcessContent(articleContent);