Bug 1162917 - Update readability from github repo. a=sledru
author: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Fri, 08 May 2015 12:05:33 +0100
changeset: 260460 5fc66f6dd277
parent: 260459 855c88138927
child: 260461 85229fbaf017
push id: 786
push user: ryanvm@gmail.com
push date: 2015-05-11 14:41 +0000
treeherder: mozilla-release@85229fbaf017 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: sledru
bugs: 1162917
milestone: 38.0.5
Bug 1162917 - Update readability from github repo. a=sledru
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -278,16 +278,17 @@
     "embed": true,
     "hr": true,
     "img": true,
     "input": true,
     "link": true,
     "meta": true,
     "param": true,
     "source": true,
+    "wbr": true
   };
 
   var whitespace = [" ", "\t", "\n", "\r"];
 
   // See http://www.w3schools.com/dom/dom_nodetype.asp
   var nodeTypes = {
     ELEMENT_NODE: 1,
     ATTRIBUTE_NODE: 2,
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -104,30 +104,30 @@ Readability.prototype = {
   // tight the competition is among candidates.
   DEFAULT_N_TOP_CANDIDATES: 5,
 
   // The maximum number of pages to loop through before we call
   // it quits and just show a link.
   DEFAULT_MAX_PAGES: 5,
 
   // Element tags to score by default.
-  DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
+  DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+    unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+    videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
 
   DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
 
@@ -734,17 +734,22 @@ Readability.prototype = {
           if (!ancestor.tagName)
             return;
 
           if (typeof(ancestor.readability) === 'undefined') {
             this._initializeNode(ancestor);
             candidates.push(ancestor);
           }
 
-          ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
+          // Node score divider:
+          // - parent:             1 (no division)
+          // - grandparent:        2
+          // - great grandparent+: ancestor level * 3
+          var scoreDivider = level === 0 ? 1 : level === 1 ? 2 : level * 3;
+          ancestor.readability.contentScore += contentScore / scoreDivider;
         });
       });
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
         var candidate = candidates[c];