--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -77,22 +77,22 @@ function Readability(uri, doc, options)
var elDesc = "";
if (e.id)
elDesc = "(#" + e.id + classDesc + ")";
else if (classDesc)
elDesc = "(" + classDesc + ")";
return rv + elDesc;
};
this.log = function () {
- if (typeof dump !== undefined) {
+ if (typeof dump !== "undefined") {
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
- } else if (typeof console !== undefined) {
+ } else if (typeof console !== "undefined") {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
};
} else {
this.log = function () {};
}
}
@@ -114,17 +114,17 @@ Readability.prototype = {
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|rss|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@@ -472,16 +472,17 @@ Readability.prototype = {
* @param Element
* @return void
**/
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
+ this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
this._forEachNode(articleContent.children, function(topCandidate) {
@@ -489,16 +490,20 @@ Readability.prototype = {
});
// If there is only one h2, they are probably using it as a header
// and not a subheader, so remove it since we already have a header.
if (articleContent.getElementsByTagName('h2').length === 1)
this._clean(articleContent, "h2");
this._clean(articleContent, "iframe");
+ this._clean(articleContent, "input");
+ this._clean(articleContent, "textarea");
+ this._clean(articleContent, "select");
+ this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
// that will affect these
this._cleanConditionally(articleContent, "table");
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");
@@ -841,28 +846,59 @@ Readability.prototype = {
this.log("Moving child out:", kids[0]);
topCandidate.appendChild(kids[0]);
}
page.appendChild(topCandidate);
this._initializeNode(topCandidate);
} else if (topCandidate) {
+ // Look for a better top candidate: the nearest ancestor of the current `topCandidate` that also contains at least
+ // three other nodes from the `topCandidates` array whose scores are close to (at least 75% of) the current `topCandidate`'s score.
+ var alternativeCandidateAncestors = [];
+ for (var i = 1; i < topCandidates.length; i++) {
+ if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
+ alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
+ }
+ }
+ var MINIMUM_TOPCANDIDATES = 3;
+ if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ var listsContainingThisAncestor = 0;
+ for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
+ listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
+ }
+ if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
+ topCandidate = parentOfTopCandidate;
+ break;
+ }
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ }
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
+
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
// few steps up the tree, that's a decent sign that there might be more content
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
- while (parentOfTopCandidate && parentOfTopCandidate.readability) {
+ while (parentOfTopCandidate.tagName !== "BODY") {
+ if (!parentOfTopCandidate.readability) {
+ parentOfTopCandidate = parentOfTopCandidate.parentNode;
+ continue;
+ }
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
if (parentScore > lastScore) {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate;
break;
}
@@ -1236,21 +1272,16 @@ Readability.prototype = {
if (segment.indexOf(".") !== -1) {
possibleType = segment.split(".")[1];
// If the type isn't alpha-only, it's probably not actually a file extension.
if (!possibleType.match(/[^a-zA-Z]/))
segment = segment.split(".")[0];
}
- // EW-CMS specific segment replacement. Ugly.
- // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
- if (segment.indexOf(',00') !== -1)
- segment = segment.replace(',00', '');
-
// If our first or second segment has anything looking like a page number, remove it.
if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
var del = false;
// If this is purely a number, and it's the first or second segment,
// it's probably a page number. Remove it.
@@ -1708,21 +1739,20 @@ Readability.prototype = {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
var linkDensity = this._getLinkDensity(node);
var contentLength = this._getInnerText(node).length;
var haveToRemove =
- // Make an exception for elements with no p's and exactly 1 img.
- (img > p && !this._hasAncestorTag(node, "figure")) ||
+ (img > 1 && img > p && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
- (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+ (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
return false;
});
},
@@ -1907,8 +1937,12 @@ Readability.prototype = {
dir: this._articleDir,
content: articleContent.innerHTML,
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
};
}
};
+
+if (typeof module === "object") {
+ module.exports = Readability;
+}