Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle)
authorLucas Rocha <lucasr@mozilla.com>
Sat, 25 Aug 2012 11:27:27 +0100
changeset 105461 5e2b2c9c4f670491189755b90f319da0d78e63f3
parent 105460 8182bfe539f8e94c93e9048db41309903bbcbb15
child 105462 66cd9a4209973a4a7b80f899a1564aaae1d68dec
push id55
push usershu@rfrn.org
push dateThu, 30 Aug 2012 01:33:09 +0000
reviewersmfinkle
bugs774914, 785145
milestone17.0a1
Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle)
mobile/android/chrome/content/Readability.js
--- a/mobile/android/chrome/content/Readability.js
+++ b/mobile/android/chrome/content/Readability.js
@@ -453,28 +453,39 @@ Readability.prototype = {
           }
         }
 
         if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
           nodesToScore[nodesToScore.length] = node;
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
-          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1) {
-            let newNode = doc.createElement('p');
-            newNode.innerHTML = node.innerHTML;
-            node.parentNode.replaceChild(newNode, node);
+          // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+          // element. DIVs with only a P element inside and no text content can be
+          // safely converted into plain P elements to avoid confusing the scoring
+          // algorithm with DIVs with are, in practice, paragraphs.
+          let pIndex = this._getSinglePIndexInsideDiv(node);
 
-            // Manually update allElements since it is not a live NodeList
-            newNode._index = nodeIndex;
-            allElements[nodeIndex] = newNode;
+          if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1 || pIndex >= 0) {
+            let newNode;
+            if (pIndex >= 0) {
+              newNode = node.childNodes[pIndex];
+            } else {
+              newNode = doc.createElement('p');
+              newNode.innerHTML = node.innerHTML;
+
+              // Manually update allElements since it is not a live NodeList
+              newNode._index = nodeIndex;
+              allElements[nodeIndex] = newNode;
+
+              nodesToScore[nodesToScore.length] = newNode;
+            }
+
+            node.parentNode.replaceChild(newNode, node);
             purgeNode(node);
-
-            nodeIndex -= 1;
-            nodesToScore[nodesToScore.length] = node;
           } else {
             // EXPERIMENTAL
             for (let i = 0, il = node.childNodes.length; i < il; i += 1) {
               let childNode = node.childNodes[i];
               if (!childNode)
                 continue;
 
               if (childNode.nodeType === 3) { // Node.TEXT_NODE
@@ -705,16 +716,46 @@ Readability.prototype = {
       scripts[i].removeAttribute('src');
 
       if (scripts[i].parentNode)
           scripts[i].parentNode.removeChild(scripts[i]);
     }
   },
 
   /**
+   * Get child index of the only P element inside a DIV with no
+   * text content. Returns -1 if the DIV node contains non-empty
+   * text nodes or if it contains other element nodes.
+   *
+   * @param Element
+  **/
+  _getSinglePIndexInsideDiv: function(e) {
+    let childNodes = e.childNodes;
+    let pIndex = -1;
+
+    for (let i = childNodes.length; --i >= 0;) {
+      let node = childNodes[i];
+
+      if (node.nodeType === Node.ELEMENT_NODE) {
+        if (node.tagName !== "P")
+          return -1;
+
+        if (pIndex >= 0)
+          return -1;
+
+        pIndex = i;
+      } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
+        return -1;
+      }
+    }
+
+    return pIndex;
+  },
+
+  /**
    * Get the inner text of a node - cross browser compatibly.
    * This also strips out any excess whitespace to be found.
    *
    * @param Element
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
     let textContent = e.textContent.replace(this.REGEXPS.trim, "");