Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle)
authorLucas Rocha <lucasr@mozilla.com>
Sat, 25 Aug 2012 11:27:27 +0100
changeset 103428 5e2b2c9c4f670491189755b90f319da0d78e63f3
parent 103427 8182bfe539f8e94c93e9048db41309903bbcbb15
child 103429 66cd9a4209973a4a7b80f899a1564aaae1d68dec
push id23348
push userryanvm@gmail.com
push dateSun, 26 Aug 2012 02:09:16 +0000
treeherdermozilla-central@b3cce81fef1a [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersmfinkle
bugs774914, 785145
milestone17.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle)
mobile/android/chrome/content/Readability.js
--- a/mobile/android/chrome/content/Readability.js
+++ b/mobile/android/chrome/content/Readability.js
@@ -453,28 +453,39 @@ Readability.prototype = {
           }
         }
 
         if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
           nodesToScore[nodesToScore.length] = node;
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
-          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1) {
-            let newNode = doc.createElement('p');
-            newNode.innerHTML = node.innerHTML;
-            node.parentNode.replaceChild(newNode, node);
+          // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+          // element. DIVs with only a P element inside and no text content can be
+          // safely converted into plain P elements to avoid confusing the scoring
+          // algorithm with DIVs with are, in practice, paragraphs.
+          let pIndex = this._getSinglePIndexInsideDiv(node);
 
-            // Manually update allElements since it is not a live NodeList
-            newNode._index = nodeIndex;
-            allElements[nodeIndex] = newNode;
+          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1 || pIndex >= 0) {
+            let newNode;
+            if (pIndex >= 0) {
+              newNode = node.childNodes[pIndex];
+            } else {
+              newNode = doc.createElement('p');
+              newNode.innerHTML = node.innerHTML;
+
+              // Manually update allElements since it is not a live NodeList
+              newNode._index = nodeIndex;
+              allElements[nodeIndex] = newNode;
+
+              nodesToScore[nodesToScore.length] = newNode;
+            }
+
+            node.parentNode.replaceChild(newNode, node);
             purgeNode(node);
-
-            nodeIndex -= 1;
-            nodesToScore[nodesToScore.length] = node;
           } else {
             // EXPERIMENTAL
             for (let i = 0, il = node.childNodes.length; i < il; i += 1) {
               let childNode = node.childNodes[i];
               if (!childNode)
                 continue;
 
               if (childNode.nodeType === 3) { // Node.TEXT_NODE
@@ -705,16 +716,46 @@ Readability.prototype = {
       scripts[i].removeAttribute('src');
 
       if (scripts[i].parentNode)
           scripts[i].parentNode.removeChild(scripts[i]);
     }
   },
 
   /**
+   * Get child index of the only P element inside a DIV with no
+   * text content. Returns -1 if the DIV node contains non-empty
+   * text nodes or if it contains other element nodes.
+   *
+   * @param Element
+  **/
+  _getSinglePIndexInsideDiv: function(e) {
+    let childNodes = e.childNodes;
+    let pIndex = -1;
+
+    for (let i = childNodes.length; --i >= 0;) {
+      let node = childNodes[i];
+
+      if (node.nodeType === Node.ELEMENT_NODE) {
+        if (node.tagName !== "P")
+          return -1;
+
+        if (pIndex >= 0)
+          return -1;
+
+        pIndex = i;
+      } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
+        return -1;
+      }
+    }
+
+    return pIndex;
+  },
+
+  /**
    * Get the inner text of a node - cross browser compatibly.
    * This also strips out any excess whitespace to be found.
    *
    * @param Element
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
     let textContent = e.textContent.replace(this.REGEXPS.trim, "");