Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle, a=akeybl)
authorLucas Rocha <lucasr@mozilla.com>
Sat, 25 Aug 2012 11:27:27 +0100
changeset 104632 233683cf491efda02381fe76e70da8d78cc6ae23
parent 104631 d1b2229d0e632a45bcf4cb52f7d5555f3c6727f5
child 104633 80edc958e20d960f2f2bf4af38b807f8c5358a59
push id1329
push userlrocha@mozilla.com
push dateWed, 29 Aug 2012 09:08:09 +0000
treeherdermozilla-beta@233683cf491e [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersmfinkle, akeybl
bugs774914, 785145
milestone16.0
Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle, a=akeybl)
mobile/android/chrome/content/Readability.js
--- a/mobile/android/chrome/content/Readability.js
+++ b/mobile/android/chrome/content/Readability.js
@@ -453,28 +453,39 @@ Readability.prototype = {
           }
         }
 
         if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
           nodesToScore[nodesToScore.length] = node;
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
-          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1) {
-            let newNode = doc.createElement('p');
-            newNode.innerHTML = node.innerHTML;
-            node.parentNode.replaceChild(newNode, node);
+          // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+          // element. DIVs with only a P element inside and no text content can be
+          // safely converted into plain P elements to avoid confusing the scoring
+          // algorithm with DIVs with are, in practice, paragraphs.
+          let pIndex = this._getSinglePIndexInsideDiv(node);
 
-            // Manually update allElements since it is not a live NodeList
-            newNode._index = nodeIndex;
-            allElements[nodeIndex] = newNode;
+          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1 || pIndex >= 0) {
+            let newNode;
+            if (pIndex >= 0) {
+              newNode = node.childNodes[pIndex];
+            } else {
+              newNode = doc.createElement('p');
+              newNode.innerHTML = node.innerHTML;
+
+              // Manually update allElements since it is not a live NodeList
+              newNode._index = nodeIndex;
+              allElements[nodeIndex] = newNode;
+
+              nodesToScore[nodesToScore.length] = newNode;
+            }
+
+            node.parentNode.replaceChild(newNode, node);
             purgeNode(node);
-
-            nodeIndex -= 1;
-            nodesToScore[nodesToScore.length] = node;
           } else {
             // EXPERIMENTAL
             for (let i = 0, il = node.childNodes.length; i < il; i += 1) {
               let childNode = node.childNodes[i];
               if (!childNode)
                 continue;
 
               if (childNode.nodeType === 3) { // Node.TEXT_NODE
@@ -705,16 +716,46 @@ Readability.prototype = {
       scripts[i].removeAttribute('src');
 
       if (scripts[i].parentNode)
           scripts[i].parentNode.removeChild(scripts[i]);
     }
   },
 
   /**
+   * Get child index of the only P element inside a DIV with no
+   * text content. Returns -1 if the DIV node contains non-empty
+   * text nodes or if it contains other element nodes.
+   *
+   * @param Element
+  **/
+  _getSinglePIndexInsideDiv: function(e) {
+    let childNodes = e.childNodes;
+    let pIndex = -1;
+
+    for (let i = childNodes.length; --i >= 0;) {
+      let node = childNodes[i];
+
+      if (node.nodeType === Node.ELEMENT_NODE) {
+        if (node.tagName !== "P")
+          return -1;
+
+        if (pIndex >= 0)
+          return -1;
+
+        pIndex = i;
+      } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
+        return -1;
+      }
+    }
+
+    return pIndex;
+  },
+
+  /**
    * Get the inner text of a node - cross browser compatibly.
    * This also strips out any excess whitespace to be found.
    *
    * @param Element
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
     let textContent = e.textContent.replace(this.REGEXPS.trim, "");