Bug 779796 - Part 6: Remove article generator function in Readability.js. r=lucasr
author Brian Nicholson <bnicholson@mozilla.com>
Thu, 09 Aug 2012 23:30:46 -0700
changeset 107519 b5ae446888f54480e2d20e59dc1b8723799a5cbb
parent 107518 04f2152643e64832f151ade90420670a039ebbb9
child 107520 28aa0856c08eed1bb26eeb0571120b01075d8976
child 107549 b32dbbd0df6fd99b07b36585984b96b4e9641ef6
child 112966 e95f1a182bc2de73e1f4df727a7b80e2c5f3f8c8
push id 1490
push user akeybl@mozilla.com
push date Mon, 08 Oct 2012 18:29:50 +0000
treeherder mozilla-beta@f335e7dacdc1 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers lucasr
bugs 779796
milestone 17.0a1
first release with
nightly linux32
b5ae446888f5 / 17.0a1 / 20120811030522 / files
nightly linux64
b5ae446888f5 / 17.0a1 / 20120811030522 / files
nightly mac
b5ae446888f5 / 17.0a1 / 20120811030522 / files
nightly win32
b5ae446888f5 / 17.0a1 / 20120811030522 / files
nightly win64
b5ae446888f5 / 17.0a1 / 20120811030522 / files
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
releases
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 779796 - Part 6: Remove article generator function in Readability.js. r=lucasr
mobile/android/chrome/content/Readability.js
mobile/android/chrome/content/readerWorker.js
--- a/mobile/android/chrome/content/Readability.js
+++ b/mobile/android/chrome/content/Readability.js
@@ -56,19 +56,16 @@ Readability.prototype = {
   FLAG_STRIP_UNLIKELYS: 0x1,
   FLAG_WEIGHT_CLASSES: 0x2,
   FLAG_CLEAN_CONDITIONALLY: 0x4,
 
   // The maximum number of pages to loop through before we call
   // it quits and just show a link.
   MAX_PAGES: 5,
 
-  // The number of iterations processed before yielding.
-  GEN_ITERATIONS: 100,
-
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
     unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
     negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
@@ -451,59 +448,34 @@ Readability.prototype = {
       case 'TH':
         node.readability.contentScore -= 5;
         break;
     }
 
     node.readability.contentScore += this._getClassWeight(node);
   },
 
-  _grabArticle: function (callback) {
-    let gen = this._grabArticleGenerator();
-    let iterate = function () {
-      for (let i = this.GEN_ITERATIONS; i--;) {
-        let result;
-        try {
-          // Parse can be interrupted if document changes (will throw dead
-          // object exception)
-          result = gen.next();
-        } catch (e) {
-          dump("Caught exception while grabbing article, aborting");
-          result = null;
-        }
-        if (result !== undefined) {
-          callback(result);
-          return;
-        }
-      }
-      setTimeout(iterate, 0);
-    }.bind(this);
-    iterate();
-  },
-
   /***
-   * grabArticleGenerator - Using a variety of metrics (content score, classname, element types), find the content that is
+   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
    *
    * @param page a document to run upon. Needs to be a full document, complete with body.
    * @return Element
   **/
-  _grabArticleGenerator: function(page) {
+  _grabArticle: function (page) {
     while (true) {
       let doc = this._doc;
       let stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
       let isPaging = (page !== null ? true: false);
 
       page = page ? page : this._doc.body;
 
       let pageCacheHtml = page.innerHTML;
       let allElements = page.getElementsByTagName('*');
 
-      yield;
-
       // First, node prepping. Trash nodes that look cruddy (like ones with the
       // class name "comment", etc), and turn divs into P tags where they have been
       // used inappropriately (as in, where they contain no other block level elements.)
       //
       // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
       // TODO: Shouldn't this be a reverse traversal?
       let node = null;
       let nodesToScore = [];
@@ -572,18 +544,16 @@ Readability.prototype = {
                 p.innerHTML = childNode.textContent;
                 p.style.display = 'inline';
                 p.className = 'readability-styled';
                 childNode.parentNode.replaceChild(p, childNode);
               }
             }
           }
         }
-
-        yield;
       }
 
       /**
        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
        * Then add their score to their parent node.
        *
        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
@@ -625,18 +595,16 @@ Readability.prototype = {
         // For every 100 characters in this paragraph, add another point. Up to 3 points.
         contentScore += Math.min(Math.floor(innerText.length / 100), 3);
 
         // Add the score to the parent. The grandparent gets half.
         parentNode.readability.contentScore += contentScore;
 
         if (grandParentNode)
           grandParentNode.readability.contentScore += contentScore / 2;
-
-        yield;
       }
 
       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       let topCandidate = null;
       for (let c = 0, cl = candidates.length; c < cl; c += 1) {
         // Scale the final candidates score based on link density. Good content
         // should have a relatively small link density (5% or less) and be mostly
@@ -647,18 +615,16 @@ Readability.prototype = {
         this.log('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" +
           candidates[c].id + ") with score " +
           candidates[c].readability.contentScore);
 
         if (!topCandidate ||
           candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
           topCandidate = candidates[c];
         }
-
-        yield;
       }
 
       // If we still have no top candidate, just use the body as a last resort.
       // We also have to copy the body node so it is something we can modify.
       if (topCandidate === null || topCandidate.tagName === "BODY") {
         topCandidate = doc.createElement("DIV");
         topCandidate.innerHTML = page.innerHTML;
 
@@ -732,25 +698,21 @@ Readability.prototype = {
           // To ensure a node does not interfere with readability styles,
           // remove its classnames.
           nodeToAppend.className = "";
 
           // Append sibling and subtract from our list because it removes
           // the node when you append to another node.
           articleContent.appendChild(nodeToAppend);
         }
-
-        yield;
       }
 
       // So we have all of the content that we need. Now we clean it up for presentation.
       this._prepArticle(articleContent);
 
-      yield;
-
       if (this._curPageNum === 1)
         articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
 
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
@@ -759,20 +721,20 @@ Readability.prototype = {
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
         } else {
-          yield null;
+          return null;
         }
       } else {
-        yield articleContent;
+        return articleContent;
       }
     }
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
@@ -1422,21 +1384,20 @@ Readability.prototype = {
    *  1. Prep the document by removing script tags, css, etc.
    *  2. Build readability's DOM tree.
    *  3. Grab the article content from the current dom tree.
    *  4. Replace the current DOM tree with the new one.
    *  5. Read peacefully.
    *
    * @return void
    **/
-  parse: function (callback) {
+  parse: function () {
     let uri = this._uri;
     if ((uri.prePath + "/") === uri.spec) {
-      callback(null);
-      return;
+      return null;
     }
 
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
     // FIXME: Disabled multi-page article support for now as it
     // needs more work on infrastructure.
 
@@ -1445,29 +1406,27 @@ Readability.prototype = {
     // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
 
     // Pull out any possible next page link first.
     // let nextPageLink = this._findNextPageLink(doc.body);
 
     this._prepDocument();
 
     let articleTitle = this._getArticleTitle();
-    this._grabArticle(function (articleContent) {
-      if (!articleContent) {
-        callback(null);
-        return;
-      }
+    let articleContent = this._grabArticle();
+    if (!articleContent) {
+      return null;
+    }
 
-      this._postProcessContent(articleContent);
+    this._postProcessContent(articleContent);
 
-      // if (nextPageLink) {
-      //   // Append any additional pages after a small timeout so that people
-      //   // can start reading without having to wait for this to finish processing.
-      //   setTimeout((function() {
-      //     this._appendNextPage(nextPageLink);
-      //   }).bind(this), 500);
-      // }
+    // if (nextPageLink) {
+    //   // Append any additional pages after a small timeout so that people
+    //   // can start reading without having to wait for this to finish processing.
+    //   setTimeout((function() {
+    //     this._appendNextPage(nextPageLink);
+    //   }).bind(this), 500);
+    // }
 
-      callback({ title: this._getInnerText(articleTitle),
-                 content: articleContent.innerHTML });
-    }.bind(this));
+    return { title: this._getInnerText(articleTitle),
+             content: articleContent.innerHTML };
   }
 };
--- a/mobile/android/chrome/content/readerWorker.js
+++ b/mobile/android/chrome/content/readerWorker.js
@@ -2,12 +2,11 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 importScripts("JSDOMParser.js", "Readability.js");
 
 self.onmessage = function (msg) {
   let uri = msg.data.uri;
   let doc = new JSDOMParser().parse(msg.data.doc);
-  new Readability(uri, doc).parse(function (result) {
-    postMessage(result);
-  });
+  let article = new Readability(uri, doc).parse();
+  postMessage(article);
 };