Bug 1509102. Stop converting \n to 
 in attribute values when serializing HTML. r=ehsan
☠☠ backed out by 7e8744eac2ca ☠ ☠
authorBoris Zbarsky <bzbarsky@mit.edu>
Wed, 28 Nov 2018 03:11:08 -0500
changeset 504932 8fc810b6e52834f8e87aa1118b56ade43e26d7ba
parent 504931 211c7dfdc4083e48ef9f86a91d219e22fe7f72b7
child 504933 b526d511169a3ba8ea647d6de9183a62df0da1c8
push id10290
push userffxbld-merge
push dateMon, 03 Dec 2018 16:23:23 +0000
treeherdermozilla-beta@700bed2445e6 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersehsan
bugs1509102
milestone65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1509102. Stop converting \n to &#xA; in attribute values when serializing HTML. r=ehsan
dom/base/nsHTMLContentSerializer.cpp
dom/base/nsHTMLContentSerializer.h
dom/base/nsXMLContentSerializer.cpp
dom/base/nsXMLContentSerializer.h
dom/base/test/unit/test_serializers_entities.js
dom/base/test/unit/test_serializers_entities_in_attr.js
dom/base/test/unit/xpcshell.ini
--- a/dom/base/nsHTMLContentSerializer.cpp
+++ b/dom/base/nsHTMLContentSerializer.cpp
@@ -444,62 +444,43 @@ static const char* const kEntityStrings[
   /* 0 */ nullptr,
   /* 1 */ "&quot;",
   /* 2 */ "&amp;",
   /* 3 */ "&lt;",
   /* 4 */ "&gt;",
   /* 5 */ "&nbsp;"
 };
 
-uint32_t FindNextBasicEntity(const nsAString& aStr,
-                             const uint32_t aLen,
-                             uint32_t aIndex,
-                             const uint8_t* aEntityTable,
-                             const char** aEntity)
-{
-  for (; aIndex < aLen; ++aIndex) {
-    // for each character in this chunk, check if it
-    // needs to be replaced
-    char16_t val = aStr[aIndex];
-    if (val <= kValNBSP && aEntityTable[val]) {
-      *aEntity = kEntityStrings[aEntityTable[val]];
-      return aIndex;
-    }
-  }
-  return aIndex;
-}
-
 bool
 nsHTMLContentSerializer::AppendAndTranslateEntities(const nsAString& aStr,
-                                                     nsAString& aOutputStr)
+                                                    nsAString& aOutputStr)
 {
   if (mBodyOnly && !mInBody) {
     return true;
   }
 
   if (mDisableEntityEncoding) {
     return aOutputStr.Append(aStr, mozilla::fallible);
   }
 
   if (mFlags & (nsIDocumentEncoder::OutputEncodeBasicEntities)) {
-    const uint8_t* entityTable = mInAttribute ? kAttrEntities : kEntities;
-    uint32_t start = 0;
-    const uint32_t len = aStr.Length();
-    for (uint32_t i = 0; i < len; ++i) {
-      const char* entity = nullptr;
-      i = FindNextBasicEntity(aStr, len, i, entityTable, &entity);
-      uint32_t normalTextLen = i - start;
-      if (normalTextLen) {
-        NS_ENSURE_TRUE(aOutputStr.Append(Substring(aStr, start, normalTextLen),
-                                         mozilla::fallible), false);
-      }
-      if (entity) {
-        NS_ENSURE_TRUE(aOutputStr.AppendASCII(entity, mozilla::fallible), false);
-        start = i + 1;
-      }
+    // Per the API documentation, encode &nbsp;, &amp;, &lt;, &gt;, and &quot;
+    if (mInAttribute) {
+      return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>(
+        aStr, aOutputStr, kAttrEntities, kEntityStrings);
     }
-    return true;
-  } else {
-    NS_ENSURE_TRUE(nsXMLContentSerializer::AppendAndTranslateEntities(aStr, aOutputStr), false);
+
+    return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>(
+      aStr, aOutputStr, kEntities, kEntityStrings);
   }
 
-  return true;
+  // We don't want to call into our superclass 2-arg version of
+  // AppendAndTranslateEntities, because it wants to encode more characters
+  // than we do.  Use our tables, but avoid encoding &nbsp; by passing in a
+  // smaller max index.  This will only encode &amp;, &lt;, &gt;, and &quot;.
+  if (mInAttribute) {
+    return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>(
+      aStr, aOutputStr, kAttrEntities, kEntityStrings);
+  }
+
+  return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>(
+    aStr, aOutputStr, kEntities, kEntityStrings);
 }
--- a/dom/base/nsHTMLContentSerializer.h
+++ b/dom/base/nsHTMLContentSerializer.h
@@ -42,15 +42,14 @@ class nsHTMLContentSerializer final : pu
                                        const nsAString& aTagNamespaceURI,
                                        nsAtom* aTagName,
                                        int32_t aNamespace,
                                        nsAString& aStr);
 
   MOZ_MUST_USE
   virtual bool AppendAndTranslateEntities(const nsAString& aStr,
                                           nsAString& aOutputStr) override;
-
 };
 
 nsresult
 NS_NewHTMLContentSerializer(nsIContentSerializer** aSerializer);
 
 #endif
--- a/dom/base/nsXMLContentSerializer.cpp
+++ b/dom/base/nsXMLContentSerializer.cpp
@@ -1170,18 +1170,16 @@ nsXMLContentSerializer::AppendToString(c
   if (mBodyOnly && !mInBody) {
     return true;
   }
   mColPos += aStr.Length();
   return aOutputStr.Append(aStr, mozilla::fallible);
 }
 
 
-static const uint16_t kGTVal = 62;
-
 #define _ 0
 
 // This table indexes into kEntityStrings[].
 static const uint8_t kEntities[] = {
   // clang-format off
   _, _, _, _, _, _, _, _, _, _,
   _, _, _, _, _, _, _, _, _, _,
   _, _, _, _, _, _, _, _, _, _,
@@ -1217,41 +1215,55 @@ static const char* const kEntityStrings[
   /* 6 */ "&#xA;",
   /* 7 */ "&#xD;",
 };
 
 bool
 nsXMLContentSerializer::AppendAndTranslateEntities(const nsAString& aStr,
                                                    nsAString& aOutputStr)
 {
+  if (mInAttribute) {
+    return AppendAndTranslateEntities<kGTVal>(aStr, aOutputStr, kAttrEntities,
+                                              kEntityStrings);
+  }
+
+  return AppendAndTranslateEntities<kGTVal>(aStr, aOutputStr, kEntities, kEntityStrings);
+}
+
+/* static */
+bool
+nsXMLContentSerializer::AppendAndTranslateEntities(const nsAString& aStr,
+                                                   nsAString& aOutputStr,
+                                                   const uint8_t aEntityTable[],
+                                                   uint16_t aMaxTableIndex,
+                                                   const char* const aStringTable[])
+{
   nsReadingIterator<char16_t> done_reading;
   aStr.EndReading(done_reading);
 
   // for each chunk of |aString|...
   uint32_t advanceLength = 0;
   nsReadingIterator<char16_t> iter;
 
-  const uint8_t* entityTable = mInAttribute ? kAttrEntities : kEntities;
-
   for (aStr.BeginReading(iter);
        iter != done_reading;
        iter.advance(int32_t(advanceLength))) {
     uint32_t fragmentLength = done_reading - iter;
     const char16_t* c = iter.get();
     const char16_t* fragmentStart = c;
     const char16_t* fragmentEnd = c + fragmentLength;
     const char* entityText = nullptr;
 
     advanceLength = 0;
     // for each character in this chunk, check if it
     // needs to be replaced
     for (; c < fragmentEnd; c++, advanceLength++) {
       char16_t val = *c;
-      if ((val <= kGTVal) && entityTable[val]) {
-        entityText = kEntityStrings[entityTable[val]];
+      if ((val <= aMaxTableIndex) && aEntityTable[val]) {
+        entityText = aStringTable[aEntityTable[val]];
         break;
       }
     }
 
     NS_ENSURE_TRUE(aOutputStr.Append(fragmentStart, advanceLength, mozilla::fallible), false);
     if (entityText) {
       NS_ENSURE_TRUE(AppendASCIItoUTF16(mozilla::MakeStringSpan(entityText),
                                         aOutputStr,
--- a/dom/base/nsXMLContentSerializer.h
+++ b/dom/base/nsXMLContentSerializer.h
@@ -164,16 +164,65 @@ class nsXMLContentSerializer : public ns
    * Appends a string by translating entities
    * It doesn't increment the column position
    */
   MOZ_MUST_USE
   virtual bool AppendAndTranslateEntities(const nsAString& aStr,
                                           nsAString& aOutputStr);
 
   /**
+   * Helper for virtual AppendAndTranslateEntities that does the actualy work.
+   *
+   * Do not call this directly.  Call it via the template helper below.
+   */
+private:
+  MOZ_MUST_USE
+  static bool AppendAndTranslateEntities(const nsAString& aStr,
+                                         nsAString& aOutputStr,
+                                         const uint8_t aEntityTable[],
+                                         uint16_t aMaxTableIndex,
+                                         const char* const aStringTable[]);
+
+protected:
+  /**
+   * Helper for calling AppendAndTranslateEntities in a way that guarantees we
+   * don't mess up our aEntityTable sizing.  This is a bit more complicated than
+   * it could be, becaue sometimes we don't want to use all of aEntityTable, so
+   * we have to allow passing the amount to use independently.  But we can
+   * statically ensure it's not too big.
+   *
+   * The first integer template argument, which callers need to specify
+   * explicitly, is the index of the last entry in aEntityTable that should be
+   * considered for encoding as an entity reference.  The second integer
+   * argument will be deduced from the actual table passed in.
+   *
+   * aEntityTable contains as values indices into aStringTable.  Those represent
+   * the strings that should be used to replace the characters that are used to
+   * index into aEntityTable.  aStringTable[0] should be nullptr, and characters
+   * that do not need replacement should map to 0 in aEntityTable.
+   */
+  template<uint16_t LargestIndex, uint16_t TableLength>
+  MOZ_MUST_USE
+  bool AppendAndTranslateEntities(const nsAString& aStr,
+                                  nsAString& aOutputStr,
+                                  const uint8_t (&aEntityTable)[TableLength],
+                                  const char* const aStringTable[])
+  {
+    static_assert(LargestIndex < TableLength,
+                  "Largest allowed index must be smaller than table length");
+    return AppendAndTranslateEntities(aStr, aOutputStr, aEntityTable,
+                                      LargestIndex, aStringTable);
+  }
+
+  /**
+   * Max index that can be used with some of our entity tables.
+   */
+  static const uint16_t kGTVal = 62;
+
+  /**
    * retrieve the text content of the node and append it to the given string
    * It doesn't increment the column position
    */
   nsresult AppendTextData(nsIContent* aNode,
                           int32_t aStartOffset,
                           int32_t aEndOffset,
                           nsAString& aStr,
                           bool aTranslateEntities);
new file mode 100644
--- /dev/null
+++ b/dom/base/test/unit/test_serializers_entities.js
@@ -0,0 +1,82 @@
+const encoders = {
+    xml: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/xml");
+	enc.init(doc, "text/xml", 0);
+	return enc;
+    },
+    html: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/html");
+	enc.init(doc, "text/html", 0);
+	return enc;
+    },
+    htmlBasic: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/html");
+	enc.init(doc, "text/html", Ci.nsIDocumentEncoder.OutputEncodeBasicEntities);
+	return enc;
+    },
+    xhtml: (doc) => {
+	let enc = Cu.createDocumentEncoder("application/xhtml+xml");
+	enc.init(doc, "application/xhtml+xml", 0);
+	return enc;
+    },
+};
+
+// Which characters should we encode as entities?  It depends on the serializer.
+const encodeAll = { html: true, htmlBasic: true, xhtml: true, xml: true };
+const encodeHTMLBasic = { html: false, htmlBasic: true, xhtml: false, xml: false };
+const encodeXML = { html: false, htmlBasic: false, xhtml: true, xml: true };
+const encodeNone = { html: false, htmlBasic: false, xhtml: false, xml: false };
+const encodingInfoMap = new Map([
+    // Basic sanity chars '<', '>', '"', '&' get encoded in all cases.
+    [ '<',  encodeAll ],
+    [ '>',  encodeAll ],
+    [ '&',  encodeAll ],
+    // nbsp is only encoded with the HTML encoder when encoding basic entities.
+    [ '\xA0', encodeHTMLBasic ],
+]);
+
+const encodingMap = new Map([
+    [ '<', '&lt;' ],
+    [ '>', '&gt;' ],
+    [ '&', '&amp;' ],
+    // nbsp is only encoded with the HTML encoder when encoding basic entities.
+    [ '\xA0', '&nbsp;' ],
+]);
+
+function encodingInfoForChar(c) {
+    var info = encodingInfoMap.get(c);
+    if (info) {
+	return info;
+    }
+    return encodeNone;
+}
+
+function encodingForChar(c, type) {
+    var info = encodingInfoForChar(c);
+    if (!info[type]) {
+	return c;
+    }
+    return encodingMap.get(c);
+}
+
+const doc = new DOMParser().parseFromString("<root></root>", "text/xml")
+const root = doc.documentElement;
+for (let i = 0; i < 255; ++i) {
+    let el = doc.createElement("span");
+    el.textContent = String.fromCharCode(i);
+    root.appendChild(el);
+}
+for (let type of ["xml", "xhtml", "htmlBasic", "html"]) {
+    let str = encoders[type](doc).encodeToString();
+    const prefix = "<root><span>";
+    const suffix = "</span></root>";
+    Assert.ok(str.startsWith(prefix), `${type} serialization starts correctly`);
+    Assert.ok(str.endsWith(suffix), `${type} serialization ends correctly`);
+    str = str.substring(prefix.length, str.length - suffix.length);
+    let encodings = str.split("</span><span>");
+    for (let i = 0; i < 255; ++i) {
+	let c = String.fromCharCode(i);
+	Assert.equal(encodingForChar(c, type), encodings[i],
+		     `${type} encoding of char ${i} is correct`);
+    }
+}
new file mode 100644
--- /dev/null
+++ b/dom/base/test/unit/test_serializers_entities_in_attr.js
@@ -0,0 +1,91 @@
+const encoders = {
+    xml: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/xml");
+	enc.init(doc, "text/xml", 0);
+	return enc;
+    },
+    html: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/html");
+	enc.init(doc, "text/html", 0);
+	return enc;
+    },
+    htmlBasic: (doc) => {
+	let enc = Cu.createDocumentEncoder("text/html");
+	enc.init(doc, "text/html", Ci.nsIDocumentEncoder.OutputEncodeBasicEntities);
+	return enc;
+    },
+    xhtml: (doc) => {
+	let enc = Cu.createDocumentEncoder("application/xhtml+xml");
+	enc.init(doc, "application/xhtml+xml", 0);
+	return enc;
+    },
+};
+
+// Which characters should we encode as entities?  It depends on the serializer.
+const encodeAll = { html: true, htmlBasic: true, xhtml: true, xml: true };
+const encodeHTMLBasic = { html: false, htmlBasic: true, xhtml: false, xml: false };
+const encodeXML = { html: false, htmlBasic: false, xhtml: true, xml: true };
+const encodeNone = { html: false, htmlBasic: false, xhtml: false, xml: false };
+const encodingInfoMap = new Map([
+    // Basic sanity chars '<', '>', '"', '&' get encoded in all cases.
+    [ '<',  encodeAll ],
+    [ '>',  encodeAll ],
+    [ '"',  encodeAll ],
+    [ '&',  encodeAll ],
+    // nbsp is only encoded with the HTML encoder when encoding basic entities.
+    [ '\xA0', encodeHTMLBasic ],
+    // Whitespace bits are only encoded in XML.
+    [ '\n', encodeXML ],
+    [ '\r', encodeXML ],
+    [ '\t', encodeXML ],
+]);
+
+const encodingMap = new Map([
+    [ '<', '&lt;' ],
+    [ '>', '&gt;' ],
+    [ '"', '&quot;' ],
+    [ '&', '&amp;' ],
+    [ '\xA0', '&nbsp;' ],
+    [ '\n', '&#xA;' ],
+    [ '\r', '&#xD;' ],
+    [ '\t', '&#9;' ],
+]);
+
+function encodingInfoForChar(c) {
+    var info = encodingInfoMap.get(c);
+    if (info) {
+	return info;
+    }
+    return encodeNone;
+}
+
+function encodingForChar(c, type) {
+    var info = encodingInfoForChar(c);
+    if (!info[type]) {
+	return c;
+    }
+    return encodingMap.get(c);
+}
+
+const doc = new DOMParser().parseFromString("<root></root>", "text/xml")
+const root = doc.documentElement;
+for (let i = 0; i < 255; ++i) {
+    let el = doc.createElement("span");
+    el.setAttribute("x", String.fromCharCode(i));
+    el.textContent = " ";
+    root.appendChild(el);
+}
+for (let type of ["xml", "xhtml", "htmlBasic", "html"]) {
+    let str = encoders[type](doc).encodeToString();
+    const prefix = '<root><span x="';
+    const suffix = '"> </span></root>';
+    Assert.ok(str.startsWith(prefix), `${type} serialization starts correctly`);
+    Assert.ok(str.endsWith(suffix), `${type} serialization ends correctly`);
+    str = str.substring(prefix.length, str.length - suffix.length);
+    let encodings = str.split('"> </span><span x="');
+    for (let i = 0; i < 255; ++i) {
+	let c = String.fromCharCode(i);
+	Assert.equal(encodingForChar(c, type), encodings[i],
+		     `${type} encoding of char ${i} is correct`);
+    }
+}
--- a/dom/base/test/unit/xpcshell.ini
+++ b/dom/base/test/unit/xpcshell.ini
@@ -33,16 +33,18 @@ skip-if = os == 'mac'
 [test_isequalnode.js]
 head = head_xml.js
 [test_nodelist.js]
 head = head_xml.js
 [test_normalize.js]
 head = head_xml.js
 [test_range.js]
 head = head_xml.js
+[test_serializers_entities.js]
+[test_serializers_entities_in_attr.js]
 [test_structuredcloneholder.js]
 [test_thirdpartyutil.js]
 [test_treewalker.js]
 head = head_xml.js
 [test_xhr_document.js]
 [test_xhr_standalone.js]
 [test_xhr_origin_attributes.js]
 [test_xml_parser.js]