--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -103,28 +103,31 @@ Readability.prototype = {
// The number of top candidates to consider when analysing how
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call
// it quits and just show a link.
DEFAULT_MAX_PAGES: 5,
+ // Element tags to score by default.
+ DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
+
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
- videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
+ videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
hasContent: /\S$/,
},
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
@@ -181,16 +184,25 @@ Readability.prototype = {
var slice = Array.prototype.slice;
var args = slice.call(arguments);
var nodeLists = args.map(function(list) {
return slice.call(list);
});
return Array.prototype.concat.apply([], nodeLists);
},
+ /**
+  * Collects every descendant of the given node whose tag is one of tagNames.
+  *
+  * @param Node node       root whose descendants are searched
+  * @param Array tagNames  tag names to look for, e.g. ["p", "pre"]
+  * @return the querySelectorAll result when the node supports it,
+  *         otherwise a flat Array of the matching elements
+  */
+ _getAllNodesWithTag: function(node, tagNames) {
+   if (node.querySelectorAll) {
+     return node.querySelectorAll(tagNames.join(','));
+   }
+   // Array#concat only spreads real arrays, not array-likes such as an
+   // HTMLCollection, so copy each collection into an Array first; otherwise
+   // the result would be an array of collections instead of elements.
+   return [].concat.apply([], tagNames.map(function(tag) {
+     return Array.prototype.slice.call(node.getElementsByTagName(tag));
+   }));
+ },
+
/**
* Converts each <a> and <img> uri in the given element to an absolute URI.
*
* @param Element
* @return void
*/
_fixRelativeUris: function(articleContent) {
var scheme = this._uri.scheme;
@@ -581,16 +593,28 @@ Readability.prototype = {
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
this._articleByline = node.textContent.trim();
return true;
}
return false;
},
+ /**
+  * Lists the ancestors of a node, nearest first.
+  *
+  * @param Node node
+  * @param Number maxDepth  optional cap on how many ancestors are collected;
+  *                         0 or omitted means walk up until there is no parent
+  * @return Array of ancestor nodes, starting with the direct parent
+  */
+ _getNodeAncestors: function(node, maxDepth) {
+   maxDepth = maxDepth || 0;
+   var i = 0, ancestors = [];
+   while (node.parentNode) {
+     ancestors.push(node.parentNode);
+     // Stop as soon as maxDepth ancestors have been collected (when a cap is set).
+     if (maxDepth && ++i === maxDepth)
+       break;
+     node = node.parentNode;
+   }
+   return ancestors;
+ },
+
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
_grabArticle: function (page) {
@@ -635,18 +659,19 @@ Readability.prototype = {
node.tagName !== "BODY" &&
node.tagName !== "A") {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
}
- if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
+ if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
+ }
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if (this._hasSinglePInsideElement(node)) {
@@ -675,57 +700,52 @@ Readability.prototype = {
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
this._forEachNode(elementsToScore, function(elementToScore) {
- var parentNode = elementToScore.parentNode;
- var grandParentNode = parentNode ? parentNode.parentNode : null;
- var innerText = this._getInnerText(elementToScore);
-
- if (!parentNode || typeof(parentNode.tagName) === 'undefined')
+ if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
return;
// If this paragraph is less than 25 characters, don't even count it.
+ var innerText = this._getInnerText(elementToScore);
if (innerText.length < 25)
return;
- // Initialize readability data for the parent.
- if (typeof parentNode.readability === 'undefined') {
- this._initializeNode(parentNode);
- candidates.push(parentNode);
- }
-
- // Initialize readability data for the grandparent.
- if (grandParentNode &&
- typeof(grandParentNode.readability) === 'undefined' &&
- typeof(grandParentNode.tagName) !== 'undefined') {
- this._initializeNode(grandParentNode);
- candidates.push(grandParentNode);
- }
+ // Exclude nodes with no ancestor.
+ var ancestors = this._getNodeAncestors(elementToScore, 3);
+ if (ancestors.length === 0)
+ return;
var contentScore = 0;
// Add a point for the paragraph itself as a base.
contentScore += 1;
// Add points for any commas within this paragraph.
contentScore += innerText.split(',').length;
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
- // Add the score to the parent. The grandparent gets half.
- parentNode.readability.contentScore += contentScore;
+ // Initialize and score ancestors: level 0 gets the full score, level N > 0 gets contentScore / (N * 2).
+ this._forEachNode(ancestors, function(ancestor, level) {
+ if (!ancestor.tagName)
+ return;
- if (grandParentNode)
- grandParentNode.readability.contentScore += contentScore / 2;
+ if (typeof(ancestor.readability) === 'undefined') {
+ this._initializeNode(ancestor);
+ candidates.push(ancestor);
+ }
+
+ ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
+ });
});
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
var topCandidates = [];
for (var c = 0, cl = candidates.length; c < cl; c += 1) {
var candidate = candidates[c];
@@ -843,20 +863,16 @@ Readability.prototype = {
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
this.log("Altering sibling:", sibling, 'to div.');
sibling = this._setNodeTag(sibling, "DIV");
}
- // To ensure a node does not interfere with readability styles,
- // remove its classnames.
- sibling.removeAttribute("class");
-
articleContent.appendChild(sibling);
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
// As a result, we must revisit this index since the nodes
// have been shifted.
s -= 1;
sl -= 1;
}
@@ -948,17 +964,17 @@ Readability.prototype = {
// Match Facebook's Open Graph title & description properties.
var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
- if (elementName === "author") {
+ if ([elementName, elementProperty].indexOf("author") !== -1) {
metadata.byline = element.getAttribute("content");
return;
}
var name = null;
if (namePattern.test(elementName)) {
name = elementName;
} else if (propertyPattern.test(elementProperty)) {
@@ -1592,16 +1608,17 @@ Readability.prototype = {
* @return void
**/
_cleanConditionally: function(e, tag) {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
+ var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
for (var i = curTagsLength-1; i >= 0; i -= 1) {
var weight = this._getClassWeight(tagsList[i]);
@@ -1627,23 +1644,23 @@ Readability.prototype = {
embedCount += 1;
}
var linkDensity = this._getLinkDensity(tagsList[i]);
var contentLength = this._getInnerText(tagsList[i]).length;
var toRemove = false;
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
- } else if (li > p && tag !== "ul" && tag !== "ol") {
+ } else if (!isList && li > p) {
toRemove = true;
- } else if ( input > Math.floor(p/3) ) {
+ } else if (input > Math.floor(p/3)) {
toRemove = true;
- } else if (contentLength < 25 && (img === 0 || img > 2) ) {
+ } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
toRemove = true;
- } else if (weight < 25 && linkDensity > 0.2) {
+ } else if (!isList && weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if (weight >= 25 && linkDensity > 0.5) {
toRemove = true;
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
toRemove = true;
}
if (toRemove) {
@@ -1658,17 +1675,17 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
var headers = e.getElementsByTagName('h' + headerIndex);
for (var i = headers.length - 1; i >= 0; i -= 1) {
- if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
+ if (this._getClassWeight(headers[i]) < 0) // negative class weight is now the only removal criterion (link-density test dropped)
headers[i].parentNode.removeChild(headers[i]);
}
}
},
_flagIsActive: function(flag) {
return (this._flags & flag) > 0;
},
@@ -1681,42 +1698,52 @@ Readability.prototype = {
this._flags = this._flags & ~flag;
},
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
- isProbablyReaderable: function() {
- var nodes = this._doc.getElementsByTagName("p");
- if (nodes.length < 5) {
- return false;
- }
+ isProbablyReaderable: function(helperIsVisible) { // helperIsVisible: optional callback(node); nodes it rejects are skipped
+ var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]); // candidates are <p> and <pre> elements
+
+ // FIXME we should have a fallback for helperIsVisible, but this is
+ // problematic because of jsdom's elem.style handling - see
+ // https://github.com/mozilla/readability/pull/186 for context.
- var possibleParagraphs = 0;
- for (var i = 0; i < nodes.length; i++) {
- var node = nodes[i];
+ var score = 0;
+ // The callback below adds to 'score' across nodes as a side effect and answers true once
+ // the running total exceeds 20 (_someNode presumably stops at the first true - confirm).
+ return this._someNode(nodes, function(node) {
+ if (helperIsVisible && !helperIsVisible(node)) // only consulted when a helper was supplied
+ return false;
var matchString = node.className + " " + node.id;
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
- continue;
+ return false;
}
- if (node.textContent.trim().length < 100) {
- continue;
+ if (node.matches && node.matches("li p")) { // ignore paragraphs nested inside list items
+ return false;
}
- possibleParagraphs++;
- if (possibleParagraphs >= 5) {
+ var textContentLength = node.textContent.trim().length;
+ if (textContentLength < 140) { // shorter than 140 trimmed characters: contributes nothing
+ return false;
+ }
+
+ score += Math.sqrt(textContentLength - 140); // square root: extra length adds progressively less
+
+ if (score > 20) { // enough accumulated text to call the page reader-able
return true;
}
- }
- return false;
+ return false;
+ });
},
/**
* Runs readability.
*
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.