Backed out changeset d59b62713c66 (bug 1395527)
author Sebastian Hengst <archaeopteryx@coole-files.de>
Sat, 02 Sep 2017 10:38:20 +0200
changeset 427933 cae6eeaf3f0c73cb62b87f465e9dd69ea2132bd3
parent 427932 d59b62713c66023954833e89374c5ecc2b92df72
child 427934 e990298e1596c5d5df0a73c929c0dacef88f5e53
push id 7761
push user jlund@mozilla.com
push date Fri, 15 Sep 2017 00:19:52 +0000
treeherder mozilla-beta@c38455951db4 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
bugs 1395527
milestone 57.0a1
backs out d59b62713c66023954833e89374c5ecc2b92df72
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Backed out changeset d59b62713c66 (bug 1395527)
netwerk/protocol/websocket/WebSocketChannel.cpp
xpcom/string/nsReadableUtils.cpp
xpcom/string/nsReadableUtils.h
--- a/netwerk/protocol/websocket/WebSocketChannel.cpp
+++ b/netwerk/protocol/websocket/WebSocketChannel.cpp
@@ -1740,17 +1740,17 @@ WebSocketChannel::ProcessInput(uint8_t *
         } else {
           if (!utf8Data.Assign((const char *)payload, payloadLength,
                                mozilla::fallible)) {
             return NS_ERROR_OUT_OF_MEMORY;
           }
         }
 
         // Section 8.1 says to fail connection if invalid utf-8 in text message
-        if (!IsUTF8(utf8Data)) {
+        if (!IsUTF8(utf8Data, false)) {
           LOG(("WebSocketChannel:: text frame invalid utf-8\n"));
           return NS_ERROR_CANNOT_CONVERT_DATA;
         }
 
         RefPtr<WebSocketFrame> frame =
           mService->CreateFrameIfNeeded(finBit, rsvBit1, rsvBit2, rsvBit3,
                                         opcode, maskBit, mask, utf8Data);
 
@@ -1791,17 +1791,17 @@ WebSocketChannel::ProcessInput(uint8_t *
             mServerCloseReason.SetLength(msglen);
             memcpy(mServerCloseReason.BeginWriting(),
                    (const char *)payload + 2, msglen);
 
             // section 8.1 says to replace received non utf-8 sequences
             // (which are non-conformant to send) with u+fffd,
             // but secteam feels that silently rewriting messages is
             // inappropriate - so we will fail the connection instead.
-            if (!IsUTF8(mServerCloseReason)) {
+            if (!IsUTF8(mServerCloseReason, false)) {
               LOG(("WebSocketChannel:: close frame invalid utf-8\n"));
               return NS_ERROR_CANNOT_CONVERT_DATA;
             }
 
             LOG(("WebSocketChannel:: close msg %s\n",
                  mServerCloseReason.get()));
           }
         }
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -612,16 +612,126 @@ IsASCII(const nsAString& aString)
     if (*c++ & NOT_ASCII) {
       return false;
     }
   }
 
   return true;
 }
 
+bool
+IsASCII(const nsACString& aString)
+{
+  static const char NOT_ASCII = char(~0x7F);  // every bit of a char except the low seven
+  // Walk the raw character buffer by hand instead of going through
+  // |copy_string|: the loop below can return at the first non-ASCII byte.
+  // If the range is empty the loop never runs and the result is |true|.
+
+  nsACString::const_iterator iter, done_reading;
+  aString.BeginReading(iter);
+  aString.EndReading(done_reading);
+
+  const char* c = iter.get();
+  const char* end = done_reading.get();
+
+  while (c < end) {
+    if (*c++ & NOT_ASCII) {  // non-zero exactly when the byte has its high bit set
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool
+IsUTF8(const nsACString& aString, bool aRejectNonChar)
+{
+  nsReadingIterator<char> done_reading;
+  aString.EndReading(done_reading);
+  // |state| is the number of continuation bytes still owed by the current sequence (0 = expecting a lead byte).
+  int32_t state = 0;
+  bool overlong = false;   // lead byte allows an overlong form; checked on the next byte only.
+  bool surrogate = false;  // lead byte allows a surrogate / out-of-range value; checked on the next byte only.
+  bool nonchar = false;    // sequence may still turn out to be a non-character.
+  uint16_t olupper = 0; // overlong: a first continuation byte <= olupper is rejected.
+  uint16_t slower = 0;  // surrogate: a first continuation byte >= slower is rejected.
+
+  nsReadingIterator<char> iter;
+  aString.BeginReading(iter);
+
+  const char* ptr = iter.get();
+  const char* end = done_reading.get();
+  while (ptr < end) {
+    uint8_t c;
+
+    if (0 == state) {
+      c = *ptr++;
+
+      if (UTF8traits::isASCII(c)) {
+        continue;
+      }
+
+      if (c <= 0xC1) { // [80-BF] is a continuation byte where a lead byte is expected; [C0-C1] would be overlong.
+        return false;
+      } else if (UTF8traits::is2byte(c)) {
+        state = 1;
+      } else if (UTF8traits::is3byte(c)) {
+        state = 2;
+        if (c == 0xE0) { // to exclude E0[80-9F][80-BF] (overlong 3-byte form)
+          overlong = true;
+          olupper = 0x9F;
+        } else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepoint
+          surrogate = true;
+          slower = 0xA0;
+        } else if (c == 0xEF) { // EF BF [BE-BF] : non-character
+          nonchar = true;
+        }
+      } else if (c <= 0xF4) { // XXX replace w/ UTF8traits::is4byte once it is updated to exclude [F5-F7] (bug 199090).
+        state = 3;
+        nonchar = true; // any 4-byte sequence may end in a non-character; narrowed down below.
+        if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2} (overlong 4-byte form)
+          overlong = true;
+          olupper = 0x8F;
+        } else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]
+          // actually not surrogates but codepoints beyond 0x10FFFF
+          surrogate = true;
+          slower = 0x90;
+        }
+      } else {
+        return false;  // Not UTF-8 string
+      }
+    }
+    // The caller asked not to reject non-characters, so stop tracking them.
+    if (nonchar && !aRejectNonChar) {
+      nonchar = false;
+    }
+
+    while (ptr < end && state) {
+      c = *ptr++;
+      --state;
+      // Drop the non-character suspicion as soon as one byte fails to match the pattern below.
+      // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
+      if (nonchar &&
+          ((!state && c < 0xBE) ||
+           (state == 1 && c != 0xBF)  ||
+           (state == 2 && 0x0F != (0x0F & c)))) {
+        nonchar = false;
+      }
+
+      if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||
+          (surrogate && slower <= c) || (nonchar && !state)) {
+        return false;  // Not UTF-8 string
+      }
+
+      overlong = surrogate = false; // both range checks apply to the first continuation byte only.
+    }
+  }
+  return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
+}
+
 /**
  * A character sink for in-place case conversion.
  */
 class ConvertToUpperCase
 {
 public:
   typedef char value_type;
 
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -13,22 +13,16 @@
  * According to our conventions, they should be |NS_xxx|.
  */
 
 #include "mozilla/Assertions.h"
 #include "nsAString.h"
 
 #include "nsTArrayForwardDeclare.h"
 
-// Can't include mozilla/Encoding.h here
-extern "C" {
-  size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
-  size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
-}
-
 inline size_t
 Distance(const nsReadingIterator<char16_t>& aStart,
          const nsReadingIterator<char16_t>& aEnd)
 {
   MOZ_ASSERT(aStart.get() <= aEnd.get());
   return static_cast<size_t>(aEnd.get() - aStart.get());
 }
 inline size_t
@@ -254,67 +248,50 @@ void AppendUnicodeTo(const nsAString::co
  */
 bool IsASCII(const nsAString& aString);
 
 /**
  * Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F).
  *
  * @param aString a 8-bit wide string to scan
  */
-inline bool IsASCII(const nsACString& aString)
-{
-  size_t length = aString.Length();
-  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
-  // For short strings, calling into Rust is a pessimization, and the SIMD
-  // code won't have a chance to kick in anyway. Additionally, handling the
-  // case of the empty string here makes null-checking ptr unnecessary.
-  // (Passing nullptr to Rust would technically be UB.)
-  if (length < 16) {
-    size_t accu = 0;
-    for (size_t i = 0; i < length; i++) {
-      accu |= ptr[i];
-    }
-    return accu < 0x80;
-  }
-  // This is not quite optimal, because it's not fail-fast when the by-register
-  // check already finds non-ASCII. Also, input to this function is almost
-  // always ASCII, so even the by-register check wouldn't need to be fail-fast
-  // and could be more like the loop above.
-  return length == encoding_ascii_valid_up_to(ptr, length);
-}
+bool IsASCII(const nsACString& aString);
 
 /**
  * Returns |true| if |aString| is a valid UTF-8 string.
+ * XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
+ * It was mainly written to replace, and is roughly equivalent to,
  *
- * Note that this doesn't check whether the string might look like a valid
- * string in another encoding, too, e.g. ISO-2022-JP.
+ *    str.Equals(NS_ConvertUTF16toUTF8(NS_ConvertUTF8toUTF16(str)))
+ *
+ * (see bug 191541)
+ * As such,  it does not check for non-UTF-8 7bit encodings such as
+ * ISO-2022-JP and HZ.
+ *
+ * It rejects sequences with the following errors:
+ *
+ * byte sequences that cannot be decoded into characters according to
+ *   UTF-8's rules (including cases where the input is part of a valid
+ *   UTF-8 sequence but starts or ends mid-character)
+ * overlong sequences (i.e., cases where a character was encoded
+ *   non-canonically by using more bytes than necessary)
+ * surrogate codepoints (i.e., the codepoints reserved for
+ *   representing astral characters in UTF-16)
+ * codepoints above the unicode range (i.e., outside the first 17
+ *   planes; higher than U+10FFFF), in accordance with
+ *   http://tools.ietf.org/html/rfc3629
+ * when aRejectNonChar is true (the default), any codepoint whose low
+ *   16 bits are 0xFFFE or 0xFFFF
+
  *
  * @param aString an 8-bit wide string to scan
+ * @param aRejectNonChar a boolean to control the rejection of utf-8
+ *        non characters
  */
-inline bool IsUTF8(const nsACString& aString)
-{
-  size_t length = aString.Length();
-  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
-  // For short strings, calling into Rust is a pessimization, and the SIMD
-  // code won't have a chance to kick in anyway. Additionally, handling the
-  // case of the empty string here makes null-checking ptr unnecessary.
-  // (Passing nullptr to Rust would technically be UB.)
-  if (length < 16) {
-    for (size_t i = 0; i < length; i++) {
-      if (ptr[i] >= 0x80) {
-        ptr += i;
-        length -= i;
-        goto end;
-      }
-    }
-    return true;
-  }
-  end:
-  return length == encoding_utf8_valid_up_to(ptr, length);
-}
+bool IsUTF8(const nsACString& aString, bool aRejectNonChar = true);
 
 bool ParseString(const nsACString& aAstring, char aDelimiter,
                  nsTArray<nsCString>& aArray);
 
 /**
  * Converts case in place in the argument string.
  */
 void ToUpperCase(nsACString&);