Bug 1426909 - Introduce a new mfbt/Utf8.h header for UTF-8-related functionality, including a UTF-8 code unit type that is compatible with, but doesn't directly interconvert with, |char|. r=froydnj
author: Jeff Walden <jwalden@mit.edu>
date: Thu, 11 Jan 2018 11:29:53 -0700
changeset: 480371 d7fcfaa2c82d744eb07a138d8fb4870bd039127f
parent: 480370 135b5bfa762aacc5a4e273f6f925172740c9b250
child: 480372 1031a09274e0bc148e11f0dae2426e64e107c69a
push id: 9719
push user: ffxbld-merge
push date: Fri, 24 Aug 2018 17:49:46 +0000
treeherder: mozilla-beta@719ec98fba77 [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: froydnj
bugs: 1426909
milestone: 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1426909 - Introduce a new mfbt/Utf8.h header for UTF-8-related functionality, including a UTF-8 code unit type that is compatible with, but doesn't directly interconvert with, |char|. r=froydnj
js/public/CharacterEncoding.h
js/src/vm/CharacterEncoding.cpp
js/src/wasm/WasmValidate.cpp
mfbt/Utf8.cpp
mfbt/Utf8.h
mfbt/moz.build
mfbt/tests/TestUtf8.cpp
mfbt/tests/moz.build
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -321,21 +321,14 @@ LossyUTF8CharsToNewLatin1CharsZ(JSContex
 
 /*
  * Returns true if all characters in the given null-terminated string are
  * ASCII, i.e. < 0x80, false otherwise.
  */
 extern JS_PUBLIC_API(bool)
 StringIsASCII(const char* s);
 
-/*
- * Returns true if the given length-delimited string is a valid UTF-8 string,
- * false otherwise.
- */
-extern JS_PUBLIC_API(bool)
-StringIsUTF8(const uint8_t* s, uint32_t length);
-
 } // namespace JS
 
 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
 
 #endif /* js_CharacterEncoding_h */
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -488,48 +488,8 @@ JS::StringIsASCII(const char* s)
 {
     while (*s) {
         if (*s & 0x80)
             return false;
         s++;
     }
     return true;
 }
-
-bool
-JS::StringIsUTF8(const uint8_t* s, uint32_t length)
-{
-    const uint8_t* limit = s + length;
-    while (s < limit) {
-        uint32_t len;
-        uint32_t min;
-        uint32_t n = *s;
-        if ((n & 0x80) == 0) {
-            len = 1;
-            min = 0;
-        } else if ((n & 0xE0) == 0xC0) {
-            len = 2;
-            min = 0x80;
-            n &= 0x1F;
-        } else if ((n & 0xF0) == 0xE0) {
-            len = 3;
-            min = 0x800;
-            n &= 0x0F;
-        } else if ((n & 0xF8) == 0xF0) {
-            len = 4;
-            min = 0x10000;
-            n &= 0x07;
-        } else {
-            return false;
-        }
-        if (s + len > limit)
-            return false;
-        for (uint32_t i = 1; i < len; i++) {
-            if ((s[i] & 0xC0) != 0x80)
-                return false;
-            n = (n << 6) | (s[i] & 0x3F);
-        }
-        if (n < min || (0xD800 <= n && n < 0xE000) || n >= 0x110000)
-            return false;
-        s += len;
-    }
-    return true;
-}
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@@ -15,28 +15,30 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 #include "wasm/WasmValidate.h"
 
 #include "mozilla/CheckedInt.h"
 #include "mozilla/Unused.h"
+#include "mozilla/Utf8.h"
 
 #include "jit/JitOptions.h"
 #include "js/Printf.h"
 #include "vm/JSContext.h"
 #include "vm/Realm.h"
 #include "wasm/WasmOpIter.h"
 
 using namespace js;
 using namespace js::jit;
 using namespace js::wasm;
 
 using mozilla::CheckedInt;
+using mozilla::IsValidUtf8;
 using mozilla::Unused;
 
 // Decoder implementation.
 
 bool
 Decoder::failf(const char* msg, ...)
 {
     va_list ap;
@@ -1282,17 +1284,17 @@ DecodeName(Decoder& d)
 
     if (numBytes > MaxStringBytes)
         return nullptr;
 
     const uint8_t* bytes;
     if (!d.readBytes(numBytes, &bytes))
         return nullptr;
 
-    if (!JS::StringIsUTF8(bytes, numBytes))
+    if (!IsValidUtf8(bytes, numBytes))
         return nullptr;
 
     UniqueChars name(js_pod_malloc<char>(numBytes + 1));
     if (!name)
         return nullptr;
 
     memcpy(name.get(), bytes, numBytes);
     name[numBytes] = '\0';
new file mode 100644
--- /dev/null
+++ b/mfbt/Utf8.cpp
@@ -0,0 +1,79 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/Types.h"
+#include "mozilla/Utf8.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Validates one code point per loop iteration; see Utf8.h for the contract.
+MFBT_API bool
+mozilla::IsValidUtf8(const void* aCodeUnits, size_t aCount)
+{
+  const auto* s = static_cast<const unsigned char*>(aCodeUnits);
+  const auto* limit = s + aCount;
+
+  while (s < limit) {
+    uint32_t n = *s++;
+
+    // If the first byte is ASCII, it's the only one in the code point.  Have a
+    // fast path that avoids all the rest of the work and looping in that case.
+    if ((n & 0x80) == 0) {
+      continue;
+    }
+
+    // The leading code unit fixes how many trailing units follow, the smallest
+    // non-overlong value (|min|), and which of its own bits are payload.
+    uint_fast8_t remaining;
+    uint32_t min;
+    if ((n & 0xE0) == 0xC0) {
+      remaining = 1;
+      min = 0x80;
+      n &= 0x1F;
+    } else if ((n & 0xF0) == 0xE0) {
+      remaining = 2;
+      min = 0x800;
+      n &= 0x0F;
+    } else if ((n & 0xF8) == 0xF0) {
+      remaining = 3;
+      min = 0x10000;
+      n &= 0x07;
+    } else {
+      // UTF-8 used to have a hyper-long encoding form, but it's been removed
+      // for years now.  So in this case, the string is not valid UTF-8.
+      return false;
+    }
+
+    // If the code point would require more code units than remain, the encoding
+    // is invalid.  Compare counts: |s + remaining| may point past |limit| (UB).
+    if (static_cast<size_t>(limit - s) < remaining) {
+      return false;
+    }
+
+    for (uint_fast8_t i = 0; i < remaining; i++) {
+      // Every non-leading code unit in properly encoded UTF-8 has its high bit
+      // set and the next-highest bit unset.
+      if ((s[i] & 0xC0) != 0x80) {
+        return false;
+      }
+
+      // The code point being encoded is the concatenation of all the
+      // unconstrained bits.
+      n = (n << 6) | (s[i] & 0x3F);
+    }
+
+    // Don't consider code points that are overlong, UTF-16 surrogates, or
+    // exceed the maximum code point to be valid.
+    if (n < min || (0xD800 <= n && n < 0xE000) || n >= 0x110000) {
+      return false;
+    }
+
+    s += remaining;
+  }
+
+  return true;
+}
new file mode 100644
--- /dev/null
+++ b/mfbt/Utf8.h
@@ -0,0 +1,210 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * UTF-8-related functionality, including a type-safe structure representing a
+ * UTF-8 code unit.
+ */
+
+#ifndef mozilla_Utf8_h
+#define mozilla_Utf8_h
+
+#include "mozilla/Types.h" // for MFBT_API
+
+#include <limits.h> // for CHAR_BIT
+#include <stddef.h> // for size_t
+#include <stdint.h> // for uint8_t
+
+namespace mozilla {
+
+union Utf8Unit;
+
+static_assert(CHAR_BIT == 8,
+              "Utf8Unit won't work so well with non-octet chars");
+
+/**
+ * A code unit within a UTF-8 encoded string.  (A code unit is the smallest
+ * unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
+ * number; for UTF-16 it would be a 16-bit number.)
+ *
+ * This is *not* the same as a single code point: in UTF-8, non-ASCII code
+ * points are constituted by multiple code units.
+ */
+union Utf8Unit
+{
+private:
+  // Utf8Unit is a union wrapping a raw |char|.  The C++ object model and C++
+  // requirements as to how objects may be accessed with respect to their actual
+  // types (almost?) uniquely compel this choice.
+  //
+  // Our requirements for a UTF-8 code unit representation are:
+  //
+  //   1. It must be "compatible" with C++ character/string literals that use
+  //      the UTF-8 encoding.  Given a properly encoded C++ literal, you should
+  //      be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
+  //      and friends (particularly UnicodeData), you should be able to access
+  //      C++ character types for their contents.
+  //   2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
+  //      explicit operation.
+  //   3. |Utf8Unit| must participate in overload resolution and template type
+  //      equivalence (that is, given |template<class> class X|, when |X<T>| and
+  //      |X<U>| are the same type) distinctly from the C++ character types.
+  //
+  // And a few nice-to-haves (at least for the moment):
+  //
+  //   4. The representation should use unsigned numbers, to avoid undefined
+  //      behavior that can arise with signed types, and because Unicode code
+  //      points and code units are unsigned.
+  //   5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
+  //      and |unsigned char*|, for APIs that (because of #4 above) use those
+  //      types as the "natural" choice for UTF-8 data.
+  //
+  // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
+  // |{,{un,}signed} char|.[0]  |uint8_t| won't work because it might not be a
+  // C++ character type.
+  //
+  // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
+  // typedefs don't generate *new* types, just type aliases).  This requires a
+  // compound type.
+  //
+  // The ultimate representation (and character type in it) is constrained by
+  // C++14 [basic.lval]p10 that defines how objects may be accessed, with
+  // respect to the dynamic type in memory and the actual type used to access
+  // them.  It reads:
+  //
+  //     If a program attempts to access the stored value of an object
+  //     through a glvalue of other than one of the following types the
+  //     behavior is undefined:
+  //
+  //       1. the dynamic type of the object,
+  //       2. a cv-qualified version of the dynamic type of the object,
+  //       ...other types irrelevant here...
+  //       3. an aggregate or union type that includes one of the
+  //          aforementioned types among its elements or non-static data
+  //          members (including, recursively, an element or non-static
+  //          data member of a subaggregate or contained union),
+  //       ...more irrelevant types...
+  //       4. a char or unsigned char type.
+  //
+  // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
+  // matter the representation by #4.  (Briefly set aside what values are seen.)
+  // (And #2 allows |const| on either the dynamic type or the accessing type.)
+  // (|signed char| is really only useful for small signed numbers, not
+  // characters, so we ignore it.)
+  //
+  // If we interpret contents as |char|/|unsigned char| contrary to the actual
+  // type stored there, what happens?  C++14 [basic.fundamental]p1 requires
+  // character types be identically aligned/sized; C++14 [basic.fundamental]p3
+  // requires |signed char| and |unsigned char| have the same value
+  // representation.  C++ doesn't require identical bitwise representation, tho.
+  // Practically we could assume it, but this verges on C++ spec bits best not
+  // *relied* on for correctness, if possible.
+  //
+  // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
+  // and |char*|.  Instead we safely expose |unsigned char| by fully-defined
+  // *integral conversion* (C++14 [conv.integral]p2).  Integral conversion from
+  // |unsigned char| → |char| has only implementation-defined behavior.  It'd be
+  // better not to depend on that, but given two's complement won, it should be
+  // okay.  (Also |unsigned char*| is awkward enough to work with for strings
+  // that it probably doesn't appear in string manipulation much anyway, only in
+  // places that should really use |Utf8Unit| directly.)
+  //
+  // The opposite direction -- interpreting |char| or |char*| data through
+  // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
+  // decided above, using #3.  An "aggregate or union" will work that contains a
+  // |char|.  Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
+  // aggregates must have "no private or protected non-static data members", and
+  // we want to keep the inner |char| hidden.  So a |struct| is out, and only
+  // |union| remains.
+  //
+  // (Enums are not "an aggregate or union type", so [maybe surprisingly] we
+  // can't make |Utf8Unit| an enum class with |char| underlying type, because we
+  // are given no license to treat |char| memory as such an |enum|'s memory.)
+  //
+  // Therefore |Utf8Unit| is a union type with a |char| non-static data member.
+  // This satisfies all our requirements.  It also supports the nice-to-haves of
+  // creating a |Utf8Unit| from an |unsigned char|, and being convertible to
+  // |unsigned char|.  It doesn't satisfy the nice-to-haves of using an
+  // |unsigned char| internally, nor of letting us wrap an existing
+  // |unsigned char| or pointer to one.  We probably *could* do these, if we
+  // were willing to rely harder on implementation-defined behaviors, but for
+  // now we privilege C++'s main character type over some conceptual purity.
+  //
+  // 0. There's a proposal for a UTF-8 character type distinct from the existing
+  //    C++ narrow character types:
+  //
+  //      http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
+  //
+  //    but it hasn't been standardized (and might never be), and none of the
+  //    compilers we really care about have implemented it.  Maybe someday we
+  //    can change our implementation to it without too much trouble, if we're
+  //    lucky...
+  char mValue;
+
+public:
+  explicit constexpr Utf8Unit(char aUnit)
+    : mValue(aUnit)
+  {}
+
+  explicit constexpr Utf8Unit(unsigned char aUnit)
+    : mValue(static_cast<char>(aUnit))
+  {
+    // Per the comment above |mValue|, the prior cast is integral conversion
+    // with implementation-defined semantics, and we regretfully but unavoidably
+    // assume the conversion does what we want it to.
+  }
+  /** Two code units are equal when their stored |char| values are equal. */
+  constexpr bool operator==(const Utf8Unit& aOther) const
+  {
+    return mValue == aOther.mValue;
+  }
+  /** The exact negation of |operator==|. */
+  constexpr bool operator!=(const Utf8Unit& aOther) const
+  {
+    return !(*this == aOther);
+  }
+
+  /** Convert a UTF-8 code unit to a raw char. */
+  constexpr char toChar() const
+  {
+    // Only a |char| is ever permitted to be written into this location, so this
+    // is both permissible and returns the desired value.
+    return mValue;
+  }
+
+  /** Convert a UTF-8 code unit to a raw unsigned char. */
+  constexpr unsigned char toUnsignedChar() const
+  {
+    // Per the comment above |mValue|, this is well-defined integral conversion.
+    return static_cast<unsigned char>(mValue);
+  }
+
+  /** Convert a UTF-8 code unit to a uint8_t. */
+  constexpr uint8_t toUint8() const
+  {
+    // Per the comment above |mValue|, this is well-defined integral conversion.
+    return static_cast<uint8_t>(mValue);
+  }
+
+  // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
+  // that's a somewhat separate concern, justified in different comments in
+  // that other code.
+};
+
+/**
+ * Returns true if the given length-delimited memory consists of a valid UTF-8
+ * string, false otherwise.
+ *
+ * A valid UTF-8 string contains no overlong-encoded code points (as one would
+ * expect), no code unit sequence encoding a UTF-16 surrogate, and no code point
+ * above U+10FFFF.  The string *may* contain U+0000 NULL code points.
+ */
+extern MFBT_API bool
+IsValidUtf8(const void* aCodeUnits, size_t aCount);
+
+} // namespace mozilla
+
+#endif /* mozilla_Utf8_h */
--- a/mfbt/moz.build
+++ b/mfbt/moz.build
@@ -98,16 +98,17 @@ EXPORTS.mozilla = [
     'ToString.h',
     'Tuple.h',
     'TypedEnumBits.h',
     'Types.h',
     'TypeTraits.h',
     'UniquePtr.h',
     'UniquePtrExtensions.h',
     'Unused.h',
+    'Utf8.h',
     'Variant.h',
     'Vector.h',
     'WeakPtr.h',
     'WrappingOperations.h',
     'XorShift128PlusRNG.h',
 ]
 
 EXPORTS["double-conversion"] = [
@@ -141,16 +142,17 @@ UNIFIED_SOURCES += [
     'double-conversion/double-conversion/strtod.cc',
     'FloatingPoint.cpp',
     'HashFunctions.cpp',
     'JSONWriter.cpp',
     'Poison.cpp',
     'SHA1.cpp',
     'TaggedAnonymousMemory.cpp',
     'Unused.cpp',
+    'Utf8.cpp',
 ]
 
 DEFINES['IMPL_MFBT'] = True
 
 SOURCES += [
     'Compression.cpp',
     'decimal/Decimal.cpp',
     'lz4.c',
new file mode 100644
--- /dev/null
+++ b/mfbt/tests/TestUtf8.cpp
@@ -0,0 +1,115 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mozilla/Utf8.h"
+
+#include "mozilla/ArrayUtils.h"
+#include "mozilla/Assertions.h"
+
+using mozilla::ArrayLength;
+using mozilla::IsValidUtf8;
+using mozilla::Utf8Unit;
+
+static void
+TestUtf8Unit()
+{
+  // An ASCII unit must read back identically through every accessor.
+  const Utf8Unit capitalA('A');
+  const unsigned char rawA = 'A';
+  MOZ_RELEASE_ASSERT('A' == capitalA.toChar());
+  MOZ_RELEASE_ASSERT(0x41 == capitalA.toUint8());
+  MOZ_RELEASE_ASSERT(rawA == capitalA.toUnsignedChar());
+  MOZ_RELEASE_ASSERT(rawA != Utf8Unit('B').toUnsignedChar());
+
+  MOZ_RELEASE_ASSERT(Utf8Unit('A') == capitalA);
+  MOZ_RELEASE_ASSERT(Utf8Unit('B') != capitalA);
+
+  // Copy assignment makes two previously distinct units compare equal.
+  Utf8Unit target('@');
+  const Utf8Unit source('#');
+  MOZ_RELEASE_ASSERT(target != source);
+  target = source;
+  MOZ_RELEASE_ASSERT(target == source);
+}
+
+// Exercise IsValidUtf8 on valid input, malformed units, and range limits.
+static void
+TestIsValidUtf8()
+{
+  // Note we include the U+0000 NULL in this one -- and that's fine.
+  static const char asciiBytes[] = u8"How about a nice game of chess?";
+  MOZ_RELEASE_ASSERT(IsValidUtf8(asciiBytes, ArrayLength(asciiBytes)));
+
+  static const char endNonAsciiBytes[] = u8"Life is like a 🌯";
+  MOZ_RELEASE_ASSERT(IsValidUtf8(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1));
+
+  static const unsigned char badLeading[] = { 0x80 };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(badLeading, ArrayLength(badLeading)));
+
+  static const unsigned char truncated[] = { 0xE2, 0x82 }; // U+20AC, cut short
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(truncated, ArrayLength(truncated)));
+
+  // Byte count: 1
+  static const char oneBytes[] = u8"A"; // U+0041 LATIN CAPITAL LETTER A
+  constexpr size_t oneBytesLen = ArrayLength(oneBytes);
+  static_assert(oneBytesLen == 2, "U+0041 plus nul");
+  MOZ_RELEASE_ASSERT(IsValidUtf8(oneBytes, oneBytesLen));
+
+  // Byte count: 2
+  static const char twoBytes[] = u8"؆"; // U+0606 ARABIC-INDIC CUBE ROOT
+  constexpr size_t twoBytesLen = ArrayLength(twoBytes);
+  static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
+  MOZ_RELEASE_ASSERT(IsValidUtf8(twoBytes, twoBytesLen));
+
+  // Byte count: 3
+  static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA
+  constexpr size_t threeBytesLen = ArrayLength(threeBytes);
+  static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
+  MOZ_RELEASE_ASSERT(IsValidUtf8(threeBytes, threeBytesLen));
+
+  // Byte count: 4
+  static const char fourBytes[] = u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06
+  constexpr size_t fourBytesLen = ArrayLength(fourBytes);
+  static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
+  MOZ_RELEASE_ASSERT(IsValidUtf8(fourBytes, fourBytesLen));
+
+  // Max code point
+  static const char maxCodePoint[] = u8"􏿿"; // U+10FFFF
+  constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
+  static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
+  MOZ_RELEASE_ASSERT(IsValidUtf8(maxCodePoint, maxCodePointLen));
+
+  // One past max code point
+  static const unsigned char onePastMaxCodePoint[] = { 0xF4, 0x90, 0x80, 0x80 };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(onePastMaxCodePoint, ArrayLength(onePastMaxCodePoint)));
+
+  // Surrogate-related testing
+  static const unsigned char justBeforeSurrogates[] = { 0xED, 0x9F, 0xBF };
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justBeforeSurrogates, ArrayLength(justBeforeSurrogates)));
+
+  static const unsigned char leastSurrogate[] = { 0xED, 0xA0, 0x80 };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, ArrayLength(leastSurrogate)));
+
+  static const unsigned char arbitraryHighSurrogate[] = { 0xED, 0xA2, 0x87 };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryHighSurrogate, ArrayLength(arbitraryHighSurrogate)));
+
+  static const unsigned char arbitraryLowSurrogate[] = { 0xED, 0xB7, 0xAF };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryLowSurrogate, ArrayLength(arbitraryLowSurrogate)));
+
+  static const unsigned char greatestSurrogate[] = { 0xED, 0xBF, 0xBF };
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, ArrayLength(greatestSurrogate)));
+
+  static const unsigned char justAfterSurrogates[] = { 0xEE, 0x80, 0x80 };
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, ArrayLength(justAfterSurrogates)));
+}
+
+int
+main() // Entry point: runs both test functions defined in this file.
+{
+  TestUtf8Unit();
+  TestIsValidUtf8();
+  return 0;
+}
--- a/mfbt/tests/moz.build
+++ b/mfbt/tests/moz.build
@@ -54,16 +54,17 @@ CppUnitTests([
     'TestSPSCQueue',
     'TestTemplateLib',
     'TestTextUtils',
     'TestThreadSafeWeakPtr',
     'TestTuple',
     'TestTypedEnum',
     'TestTypeTraits',
     'TestUniquePtr',
+    'TestUtf8',
     'TestVariant',
     'TestVector',
     'TestWeakPtr',
     'TestWrappingOperations',
     'TestXorShift128PlusRNG',
 ])
 
 if not CONFIG['MOZ_ASAN']: