Bug 1426909 - Abstract out mozilla::DecodeOneUtf8CodePoint for decoding a UTF-8 code point after having consumed a non-ASCII lead unit, with configurable error notification through optional user-provided functors. r=froydnj
author: Jeff Walden <jwalden@mit.edu>
Thu, 12 Jul 2018 17:41:31 -0700
changeset 427399 8258ce540165d59bd100a953e89ea316fe5962ac
parent 427398 7d07cfa666bf8bb2ec9aef3c7f88654f66e9a433
child 427400 6db05e2cc96ff33736d32787bb6e333386a45e0e
push id: 105457
push user: jwalden@mit.edu
push date: Fri, 20 Jul 2018 06:25:54 +0000
treeherdermozilla-inbound@8258ce540165 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: froydnj
bugs: 1426909
milestone: 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1426909 - Abstract out mozilla::DecodeOneUtf8CodePoint for decoding a UTF-8 code point after having consumed a non-ASCII lead unit, with configurable error notification through optional user-provided functors. r=froydnj
mfbt/Utf8.cpp
mfbt/Utf8.h
mfbt/tests/TestUtf8.cpp
--- a/mfbt/Utf8.cpp
+++ b/mfbt/Utf8.cpp
@@ -1,79 +1,39 @@
 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include "mozilla/Maybe.h"
+#include "mozilla/TextUtils.h"
 #include "mozilla/Types.h"
 #include "mozilla/Utf8.h"
 
 #include <stddef.h>
 #include <stdint.h>
 
 MFBT_API bool
 mozilla::IsValidUtf8(const void* aCodeUnits, size_t aCount)
 {
   const auto* s = static_cast<const unsigned char*>(aCodeUnits);
-  const auto* limit = s + aCount;
+  const auto* const limit = s + aCount; // one past the final code unit
 
   while (s < limit) {
-    uint32_t n = *s++;
+    unsigned char c = *s++; // lead unit; |s| now points one past it
 
     // If the first byte is ASCII, it's the only one in the code point.  Have a
     // fast path that avoids all the rest of the work and looping in that case.
-    if ((n & 0x80) == 0) {
+    if (IsAscii(c)) {
       continue;
     }
 
-    // The leading code unit determines the length of the next code point and
-    // the number of bits of the leading code unit that contribute to the code
-    // point's value.
-    uint_fast8_t remaining;
-    uint32_t min;
-    if ((n & 0xE0) == 0xC0) {
-      remaining = 1;
-      min = 0x80;
-      n &= 0x1F;
-    } else if ((n & 0xF0) == 0xE0) {
-      remaining = 2;
-      min = 0x800;
-      n &= 0x0F;
-    } else if ((n & 0xF8) == 0xF0) {
-      remaining = 3;
-      min = 0x10000;
-      n &= 0x07;
-    } else {
-      // UTF-8 used to have a hyper-long encoding form, but it's been removed
-      // for years now.  So in this case, the string is not valid UTF-8.
+    Maybe<char32_t> maybeCodePoint =
+      DecodeOneUtf8CodePoint(Utf8Unit(c), &s, limit); // on success |s| is advanced past the trailing units
+    if (maybeCodePoint.isNothing()) { // any decoding failure makes the whole buffer invalid
       return false;
     }
-
-    // If the code point would require more code units than remain, the encoding
-    // is invalid.
-    if (s + remaining > limit) {
-      return false;
-    }
-
-    for (uint_fast8_t i = 0; i < remaining; i++) {
-      // Every non-leading code unit in properly encoded UTF-8 has its high bit
-      // set and the next-highest bit unset.
-      if ((s[i] & 0xC0) != 0x80) {
-        return false;
-      }
-
-      // The code point being encoded is the concatenation of all the
-      // unconstrained bits.
-      n = (n << 6) | (s[i] & 0x3F);
-    }
-
-    // Don't consider code points that are overlong, UTF-16 surrogates, or
-    // exceed the maximum code point to be valid.
-    if (n < min || (0xD800 <= n && n < 0xE000) || n >= 0x110000) {
-      return false;
-    }
-
-    s += remaining;
   }
 
+  MOZ_ASSERT(s == limit); // the loop consumes whole code points, so it must stop exactly at the end
   return true;
 }
--- a/mfbt/Utf8.h
+++ b/mfbt/Utf8.h
@@ -7,16 +7,20 @@
 /*
  * UTF-8-related functionality, including a type-safe structure representing a
  * UTF-8 code unit.
  */
 
 #ifndef mozilla_Utf8_h
 #define mozilla_Utf8_h
 
+#include "mozilla/Casting.h" // for mozilla::AssertedCast
+#include "mozilla/Likely.h" // for MOZ_UNLIKELY
+#include "mozilla/Maybe.h" // for mozilla::Maybe
+#include "mozilla/TextUtils.h" // for mozilla::IsAscii
 #include "mozilla/Types.h" // for MFBT_API
 
 #include <limits.h> // for CHAR_BIT
 #include <stddef.h> // for size_t
 #include <stdint.h> // for uint8_t
 
 namespace mozilla {
 
@@ -200,11 +204,222 @@ public:
  *
  * A valid UTF-8 string contains no overlong-encoded code points (as one would
  * expect) and contains no code unit sequence encoding a UTF-16 surrogate.  The
  * string *may* contain U+0000 NULL code points.
  */
 extern MFBT_API bool
 IsValidUtf8(const void* aCodeUnits, size_t aCount);
 
+/**
+ * Decode the remainder of one multi-unit UTF-8 code point whose lead unit has
+ * already been consumed by the caller.
+ *
+ * Arguments:
+ *
+ *   - |aLeadUnit| is the non-ASCII code unit that begins the code point.
+ *   - |aIter| points to an iterator that, on entry, refers to the unit
+ *     immediately after |aLeadUnit|.  This function mutates |*aIter|.
+ *   - |aEnd| marks the end of the UTF-8 data; the number of units still
+ *     available is computed as |aEnd - *aIter|.
+ *
+ * Success: when |aLeadUnit| and the trailing units read through |*aIter|
+ * encode a code point that lies within Unicode's range, is not a surrogate,
+ * and is in shortest form, |*aIter| is left just past the last unit consumed
+ * and Some(that code point) is returned.
+ *
+ * Failure: |*aIter| is moved back so that it refers to |aLeadUnit| again, the
+ * notifier functor matching the kind of failure is invoked, and Nothing() is
+ * returned.
+ *
+ * |Iter| and |EndIter| are most easily pictured as |const char*|,
+ * |const unsigned char*|, or |const Utf8Unit*|: dereferencing yields something
+ * a |Utf8Unit| can be constructed from, and the limited arithmetic used below
+ * (indexing, post-increment, |-=|, and |aEnd - *aIter|) is supported.  They are
+ * template parameters so that more-complicated adaptor iterators also work.
+ *
+ * The template parameters after |EndIter| are the failure notifiers:
+ *
+ *   - |aOnBadLeadUnit()|
+ *   - |aOnNotEnoughUnits(unitsAvailable, unitsNeeded)| -- both counts include
+ *     the lead unit
+ *   - |aOnBadTrailingUnit(unitsObserved)| -- count includes the lead unit and
+ *     the offending trailing unit
+ *   - |aOnBadCodePoint(value, unitsObserved)|
+ *   - |aOnNotShortestForm(value, unitsObserved)|
+ *
+ * The three-argument overload of this function supplies no-op notifiers and
+ * documents each notifier's contract in more detail.
+ *
+ * This function is MOZ_ALWAYS_INLINE: callers that don't need forced inlining
+ * should use the same-signature function without the "Inline" suffix.
+ */
+template<typename Iter,
+         typename EndIter,
+         class OnBadLeadUnit,
+         class OnNotEnoughUnits,
+         class OnBadTrailingUnit,
+         class OnBadCodePoint,
+         class OnNotShortestForm>
+MOZ_ALWAYS_INLINE Maybe<char32_t>
+DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
+                             Iter* aIter, const EndIter aEnd,
+                             OnBadLeadUnit aOnBadLeadUnit,
+                             OnNotEnoughUnits aOnNotEnoughUnits,
+                             OnBadTrailingUnit aOnBadTrailingUnit,
+                             OnBadCodePoint aOnBadCodePoint,
+                             OnNotShortestForm aOnNotShortestForm)
+{
+  MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);
+
+  char32_t codePoint = aLeadUnit.toUint8();
+  MOZ_ASSERT(!IsAscii(codePoint));
+
+  // Classify the lead unit.  Its high bits fix how many trailing units follow
+  // and the smallest value that legitimately needs that many units; its low
+  // bits are the most significant payload bits of the code point.  The three
+  // bit patterns are mutually exclusive.
+  uint8_t trailingUnits;
+  uint32_t smallestValue;
+  if ((codePoint & 0b1111'1000) == 0b1111'0000) {
+    trailingUnits = 3;
+    smallestValue = 0x10000;
+    codePoint &= 0b0000'0111;
+  } else if ((codePoint & 0b1111'0000) == 0b1110'0000) {
+    trailingUnits = 2;
+    smallestValue = 0x800;
+    codePoint &= 0b0000'1111;
+  } else if ((codePoint & 0b1110'0000) == 0b1100'0000) {
+    trailingUnits = 1;
+    smallestValue = 0x80;
+    codePoint &= 0b0001'1111;
+  } else {
+    // Not a recognized lead unit: back up onto it and report.
+    *aIter -= 1;
+    aOnBadLeadUnit();
+    return Nothing();
+  }
+
+  // The data must actually contain every trailing unit the lead unit promised.
+  const auto unitsLeft = aEnd - *aIter;
+  if (MOZ_UNLIKELY(unitsLeft < trailingUnits)) {
+    *aIter -= 1;
+    aOnNotEnoughUnits(AssertedCast<uint8_t>(unitsLeft + 1), trailingUnits + 1);
+    return Nothing();
+  }
+
+  uint8_t trailingRead = 0;
+  while (trailingRead < trailingUnits) {
+    const uint8_t unit = Utf8Unit(*(*aIter)++).toUint8();
+    trailingRead++;
+
+    // A well-formed trailing unit looks like 0b10xx'xxxx.
+    if (MOZ_UNLIKELY((unit & 0b1100'0000) != 0b1000'0000)) {
+      // Lead unit plus every trailing unit read so far, the bad one included.
+      const uint8_t unitsObserved = trailingRead + 1;
+      *aIter -= unitsObserved;
+      aOnBadTrailingUnit(unitsObserved);
+      return Nothing();
+    }
+
+    // Append this unit's six payload bits to the accumulated value.
+    codePoint = (codePoint << 6) | (unit & 0b0011'1111);
+  }
+
+  // Reject surrogates and values past U+10FFFF before considering overlong
+  // encodings.
+  const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
+  if (MOZ_UNLIKELY(isSurrogate || codePoint > 0x10FFFF)) {
+    const uint8_t unitsObserved = trailingUnits + 1;
+    *aIter -= unitsObserved;
+    aOnBadCodePoint(codePoint, unitsObserved);
+    return Nothing();
+  }
+
+  // A value that would have fit in fewer units was encoded overlong.
+  if (MOZ_UNLIKELY(codePoint < smallestValue)) {
+    const uint8_t unitsObserved = trailingUnits + 1;
+    *aIter -= unitsObserved;
+    aOnNotShortestForm(codePoint, unitsObserved);
+    return Nothing();
+  }
+
+  return Some(codePoint);
+}
+
+/**
+ * Counterpart of the notifier-taking DecodeOneUtf8CodePointInline that is only
+ * |inline| rather than MOZ_ALWAYS_INLINE.  It accepts the same arguments and
+ * forwards them unchanged, leaving the compiler free to share one
+ * instantiation among separate call sites if it chooses.
+ */
+template<typename Iter, typename EndIter,
+         class OnBadLeadUnit, class OnNotEnoughUnits,
+         class OnBadTrailingUnit, class OnBadCodePoint,
+         class OnNotShortestForm>
+inline Maybe<char32_t>
+DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
+                       Iter* aIter,
+                       const EndIter aEnd,
+                       OnBadLeadUnit aOnBadLeadUnit,
+                       OnNotEnoughUnits aOnNotEnoughUnits,
+                       OnBadTrailingUnit aOnBadTrailingUnit,
+                       OnBadCodePoint aOnBadCodePoint,
+                       OnNotShortestForm aOnNotShortestForm)
+{
+  return DecodeOneUtf8CodePointInline(aLeadUnit,
+                                      aIter,
+                                      aEnd,
+                                      aOnBadLeadUnit,
+                                      aOnNotEnoughUnits,
+                                      aOnBadTrailingUnit,
+                                      aOnBadCodePoint,
+                                      aOnNotShortestForm);
+}
+
+/**
+ * Convenience overload of the always-inlined decoder above for callers that
+ * only care whether decoding succeeded: every failure notifier is a no-op.
+ *
+ * The comments on the no-op notifiers below double as the specification of how
+ * and when each notifier is invoked.
+ *
+ * This function is MOZ_ALWAYS_INLINE: callers that don't need forced inlining
+ * should use the same-signature function without the "Inline" suffix.
+ */
+template<typename Iter, typename EndIter>
+MOZ_ALWAYS_INLINE Maybe<char32_t>
+DecodeOneUtf8CodePointInline(const Utf8Unit aLeadUnit,
+                             Iter* aIter, const EndIter aEnd)
+{
+  // Bad lead unit: |aLeadUnit| is not a valid first unit of a multi-unit code
+  // point.  No arguments are passed, because the caller already holds
+  // |aLeadUnit|.
+  const auto ignoreBadLeadUnit = [] {};
+
+  // Not enough units: |aLeadUnit| indicates a valid length, but fewer units
+  // than that remain between |*aIter| and |aEnd|.  The arguments are the number
+  // of units actually available (per |aEnd - *aIter|) and the number of units
+  // |aLeadUnit| calls for.  Both counts include |aLeadUnit| itself, hence
+  // available <= 3, needed <= 4, and available < needed.  The lead unit is
+  // again not passed.
+  const auto ignoreNotEnoughUnits = [](uint8_t /* aUnitsAvailable */,
+                                       uint8_t /* aUnitsNeeded */) {};
+
+  // Bad trailing unit: one of the trailing units implied by |aLeadUnit| does
+  // not have the 0b10xx'xxxx form required of every UTF-8 trailing unit.  The
+  // argument is the total number of units observed, |aLeadUnit| included, so
+  // it is at most 4 and the offending unit is conceptually at
+  // |(*aIter)[aUnitsObserved - 1]| when the notifier runs.
+  const auto ignoreBadTrailingUnit = [](uint8_t /* aUnitsObserved */) {};
+
+  // Bad code point: the encoding is structurally correct, but the encoded
+  // *value* is not a valid code point -- it is either above the U+10FFFF
+  // Unicode maximum or a UTF-16 surrogate.  The arguments are that value and
+  // the number of units that encoded it.
+  const auto ignoreBadCodePoint = [](char32_t /* aBadCodePoint */,
+                                     uint8_t /* aUnitsObserved */) {};
+
+  // Not shortest form: the encoding is structurally correct, but the value
+  // should have been encoded in fewer units (for example U+0000 written as
+  // 0b1100'0000 0b1000'0000 rather than as the single unit 0b0000'0000).  The
+  // arguments are the mis-encoded code point (valid and not a surrogate) and
+  // the number of units that mis-encoded it.
+  const auto ignoreNotShortestForm = [](char32_t /* aBadCodePoint */,
+                                        uint8_t /* aUnitsObserved */) {};
+
+  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd,
+                                      ignoreBadLeadUnit,
+                                      ignoreNotEnoughUnits,
+                                      ignoreBadTrailingUnit,
+                                      ignoreBadCodePoint,
+                                      ignoreNotShortestForm);
+}
+
+/**
+ * Counterpart of the three-argument DecodeOneUtf8CodePointInline that is only
+ * |inline| rather than MOZ_ALWAYS_INLINE, so the compiler/linker may share one
+ * instantiation among separate call sites.  All failure notifiers are no-ops.
+ */
+template<typename Iter, typename EndIter>
+inline Maybe<char32_t>
+DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
+                       Iter* aIter,
+                       const EndIter aEnd)
+{
+  return DecodeOneUtf8CodePointInline(aLeadUnit,
+                                      aIter,
+                                      aEnd);
+}
+
 } // namespace mozilla
 
 #endif /* mozilla_Utf8_h */
--- a/mfbt/tests/TestUtf8.cpp
+++ b/mfbt/tests/TestUtf8.cpp
@@ -3,18 +3,25 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "mozilla/Utf8.h"
 
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Assertions.h"
+#include "mozilla/EnumSet.h"
+#include "mozilla/IntegerRange.h"
+#include "mozilla/TextUtils.h"
 
 using mozilla::ArrayLength;
+using mozilla::DecodeOneUtf8CodePoint;
+using mozilla::EnumSet;
+using mozilla::IntegerRange;
+using mozilla::IsAscii;
 using mozilla::IsValidUtf8;
 using mozilla::Utf8Unit;
 
 static void
 TestUtf8Unit()
 {
   Utf8Unit c('A');
   MOZ_RELEASE_ASSERT(c.toChar() == 'A');
@@ -30,16 +37,252 @@ TestUtf8Unit()
   Utf8Unit second('#');
 
   MOZ_RELEASE_ASSERT(first != second);
 
   first = second;
   MOZ_RELEASE_ASSERT(first == second);
 }
 
+// Test fixture that splits the range [aStart, aEnd) into the three arguments
+// DecodeOneUtf8CodePoint takes: the (non-ASCII) lead unit, a mutable iterator
+// positioned just after it, and the end of the data.
+template<typename Char>
+struct ToUtf8Units
+{
+  explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
+    : lead(Utf8Unit(*aStart))
+    , iter(aStart + 1)
+    , end(aEnd)
+  {
+    // The decoder is only ever handed non-ASCII lead units.
+    MOZ_RELEASE_ASSERT(!IsAscii(*aStart));
+  }
+
+  const Utf8Unit lead;    // first unit of the range
+  const Char* iter;       // starts one past |lead|; the decoder mutates it
+  const Char* const end;  // end of the range
+};
+
+// Notifier functor for decodes that are expected to succeed: it accepts any
+// argument list and fails the test if it is ever invoked.
+struct AssertIfCalled
+{
+  template<typename... Args>
+  void operator()(Args&&...)
+  {
+    MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
+  }
+};
+
+// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
+//       a string literal or a more-generalized array, we require |aCharN| be
+//       null-terminated.
+
+// Require that null-terminated |aCharN| decodes to |aExpectedCodePoint| and
+// that decoding consumes everything up to the terminator.  Both overloads of
+// DecodeOneUtf8CodePoint are exercised: the notifier-free one, and the
+// notifier-taking one with notifiers that fail the test if invoked.
+template<typename Char, size_t N>
+static void
+ExpectValidCodePoint(const Char (&aCharN)[N],
+                     char32_t aExpectedCodePoint)
+{
+  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
+                     "array must be null-terminated for |aCharN + N - 1| to "
+                     "compute the value of |aIter| as altered by "
+                     "DecodeOneUtf8CodePoint");
+
+  const Char* const begin = aCharN;
+  const Char* const terminator = aCharN + N - 1;
+
+  // Pass 1: overload without notifier functors.
+  {
+    ToUtf8Units<Char> units(begin, terminator);
+    auto decoded = DecodeOneUtf8CodePoint(units.lead, &units.iter, units.end);
+    MOZ_RELEASE_ASSERT(decoded.isSome());
+    MOZ_RELEASE_ASSERT(*decoded == aExpectedCodePoint);
+    MOZ_RELEASE_ASSERT(units.iter == units.end);
+  }
+
+  // Pass 2: overload with notifiers, none of which may fire on valid input.
+  {
+    ToUtf8Units<Char> units(begin, terminator);
+    auto decoded = DecodeOneUtf8CodePoint(units.lead, &units.iter, units.end,
+                                          AssertIfCalled(),
+                                          AssertIfCalled(),
+                                          AssertIfCalled(),
+                                          AssertIfCalled(),
+                                          AssertIfCalled());
+    MOZ_RELEASE_ASSERT(decoded.isSome());
+    MOZ_RELEASE_ASSERT(*decoded == aExpectedCodePoint);
+    MOZ_RELEASE_ASSERT(units.iter == units.end);
+  }
+}
+
+enum class InvalidUtf8Reason // which DecodeOneUtf8CodePoint failure notifier fired
+{
+  BadLeadUnit, // the lead unit is not a valid multi-unit lead
+  NotEnoughUnits, // fewer units remain than the lead unit calls for
+  BadTrailingUnit, // a trailing unit is not of the form 0b10xx'xxxx
+  BadCodePoint, // encoded value is a surrogate or exceeds U+10FFFF
+  NotShortestForm, // value was encoded in more units than necessary
+};
+
+// Decode the code point at the start of null-terminated |aCharN| twice -- once
+// with the notifier-free DecodeOneUtf8CodePoint overload and once with the
+// overload taking notifier functors -- and require that both attempts fail and
+// leave the iterator pointing back at the lead unit.
+//
+// The notifier-taking attempt must report exactly one failure, and it must be
+// |aExpectedReason|.  The remaining |aExpected*| arguments are compared only
+// against the values that particular notifier receives; callers pass
+// placeholder values for the ones that do not apply.
+template<typename Char, size_t N>
+static void
+ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
+                             InvalidUtf8Reason aExpectedReason,
+                             uint8_t aExpectedUnitsAvailable,
+                             uint8_t aExpectedUnitsNeeded,
+                             char32_t aExpectedBadCodePoint,
+                             uint8_t aExpectedUnitsObserved)
+{
+  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
+                     "array must be null-terminated for |aCharN + N - 1| to "
+                     "compute the value of |aIter| as altered by "
+                     "DecodeOneUtf8CodePoint");
+
+  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
+  auto simple =
+    DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
+  MOZ_RELEASE_ASSERT(simple.isNothing());
+  MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);
+
+  // Filled in by whichever notifier fires.  Initialized so that no path can
+  // read an indeterminate value.
+  EnumSet<InvalidUtf8Reason> reasons;
+  uint8_t unitsAvailable = 0;
+  uint8_t unitsNeeded = 0;
+  char32_t badCodePoint = 0;
+  uint8_t unitsObserved = 0;
+
+  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
+  auto complex =
+    DecodeOneUtf8CodePoint(complexUnit.lead, &complexUnit.iter, complexUnit.end,
+                           [&reasons]() {
+                             reasons += InvalidUtf8Reason::BadLeadUnit;
+                           },
+                           [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
+                                                                     uint8_t aUnitsNeeded)
+                           {
+                             reasons += InvalidUtf8Reason::NotEnoughUnits;
+                             unitsAvailable = aUnitsAvailable;
+                             unitsNeeded = aUnitsNeeded;
+                           },
+                           [&reasons, &unitsObserved](uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::BadTrailingUnit;
+                             unitsObserved = aUnitsObserved;
+                           },
+                           [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
+                                                                     uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::BadCodePoint;
+                             badCodePoint = aBadCodePoint;
+                             unitsObserved = aUnitsObserved;
+                           },
+                           [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
+                                                                     uint8_t aUnitsObserved)
+                           {
+                             reasons += InvalidUtf8Reason::NotShortestForm;
+                             badCodePoint = aBadCodePoint;
+                             unitsObserved = aUnitsObserved;
+                           });
+  MOZ_RELEASE_ASSERT(complex.isNothing());
+  MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);
+
+  bool alreadyIterated = false;
+  for (InvalidUtf8Reason reason : reasons) {
+    // At most one kind of failure may be reported per decode attempt.
+    MOZ_RELEASE_ASSERT(!alreadyIterated);
+    alreadyIterated = true;
+
+    // The reported failure must be the one the caller asked for; otherwise a
+    // test naming the wrong reason would pass on coincidentally equal values.
+    MOZ_RELEASE_ASSERT(reason == aExpectedReason,
+                       "decoder reported a different failure than expected");
+
+    switch (reason) {
+    case InvalidUtf8Reason::BadLeadUnit:
+      // This notifier receives no arguments, so there is nothing to compare.
+      break;
+
+    case InvalidUtf8Reason::NotEnoughUnits:
+      MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
+      MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
+      break;
+
+    case InvalidUtf8Reason::BadTrailingUnit:
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+
+    case InvalidUtf8Reason::BadCodePoint:
+      MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+
+    case InvalidUtf8Reason::NotShortestForm:
+      MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
+      MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
+      break;
+    }
+  }
+
+  // A failed decode that invoked no notifier at all must not pass silently.
+  MOZ_RELEASE_ASSERT(alreadyIterated,
+                     "decoding failed without invoking any notifier");
+}
+
+// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
+//       a string literal or a more-generalized array, we require |aCharN| be
+//       null-terminated in all these functions.
+
+// Expect decoding |aCharN| to fail because of its lead unit.  The bad-lead-unit
+// notifier takes no arguments, so every expected-value slot of the helper gets
+// a placeholder.
+template<typename Char, size_t N>
+static void
+ExpectBadLeadUnit(const Char (&aCharN)[N])
+{
+  constexpr uint8_t placeholderCount = 0xFF;
+  constexpr char32_t placeholderCodePoint = 0xFFFFFFFF;
+
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadLeadUnit,
+                               placeholderCount,
+                               placeholderCount,
+                               placeholderCodePoint,
+                               placeholderCount);
+}
+
+// Expect decoding |aCharN| to fail because fewer units are available than the
+// lead unit calls for.  Only the available/needed counts are meaningful; the
+// code point and units-observed slots of the helper get placeholders.
+template<typename Char, size_t N>
+static void
+ExpectNotEnoughUnits(const Char (&aCharN)[N],
+                     uint8_t aExpectedUnitsAvailable,
+                     uint8_t aExpectedUnitsNeeded)
+{
+  constexpr char32_t placeholderCodePoint = 0xFFFFFFFF;
+  constexpr uint8_t placeholderUnitsObserved = 0xFF;
+
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::NotEnoughUnits,
+                               aExpectedUnitsAvailable,
+                               aExpectedUnitsNeeded,
+                               placeholderCodePoint,
+                               placeholderUnitsObserved);
+}
+
+// Expect decoding |aCharN| to fail on a malformed trailing unit after
+// |aExpectedUnitsObserved| units (lead unit included) have been examined.  The
+// other expected-value slots of the helper get placeholders.
+template<typename Char, size_t N>
+static void
+ExpectBadTrailingUnit(const Char (&aCharN)[N],
+                      uint8_t aExpectedUnitsObserved)
+{
+  constexpr uint8_t placeholderCount = 0xFF;
+  constexpr char32_t placeholderCodePoint = 0xFFFFFFFF;
+
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadTrailingUnit,
+                               placeholderCount,
+                               placeholderCount,
+                               placeholderCodePoint,
+                               aExpectedUnitsObserved);
+}
+
+// Expect decoding |aCharN| to fail because |aExpectedBadCodePoint| was encoded
+// in |aExpectedUnitsObserved| units when fewer would have sufficed.  The
+// available/needed slots of the helper get placeholders.
+template<typename Char, size_t N>
+static void
+ExpectNotShortestForm(const Char (&aCharN)[N],
+                      char32_t aExpectedBadCodePoint,
+                      uint8_t aExpectedUnitsObserved)
+{
+  constexpr uint8_t placeholderCount = 0xFF;
+
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::NotShortestForm,
+                               placeholderCount,
+                               placeholderCount,
+                               aExpectedBadCodePoint,
+                               aExpectedUnitsObserved);
+}
+
+// Expect decoding |aCharN| to fail because its |aExpectedUnitsObserved| units
+// encode |aExpectedBadCodePoint|, a value that is not a valid code point.  The
+// available/needed slots of the helper get placeholders.
+template<typename Char, size_t N>
+static void
+ExpectBadCodePoint(const Char (&aCharN)[N],
+                   char32_t aExpectedBadCodePoint,
+                   uint8_t aExpectedUnitsObserved)
+{
+  constexpr uint8_t placeholderCount = 0xFF;
+
+  ExpectInvalidCodePointHelper(aCharN,
+                               InvalidUtf8Reason::BadCodePoint,
+                               placeholderCount,
+                               placeholderCount,
+                               aExpectedBadCodePoint,
+                               aExpectedUnitsObserved);
+}
+
 static void
 TestIsValidUtf8()
 {
   // Note we include the U+0000 NULL in this one -- and that's fine.
   static const char asciiBytes[] = u8"How about a nice game of chess?";
   MOZ_RELEASE_ASSERT(IsValidUtf8(asciiBytes, ArrayLength(asciiBytes)));
 
   static const char endNonAsciiBytes[] = u8"Life is like a 🌯";
@@ -57,59 +300,481 @@ TestIsValidUtf8()
   MOZ_RELEASE_ASSERT(IsValidUtf8(oneBytes, oneBytesLen));
 
   // 2
   static const char twoBytes[] = u8"؆"; // U+0606 ARABIC-INDIC CUBE ROOT
   constexpr size_t twoBytesLen = ArrayLength(twoBytes);
   static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
   MOZ_RELEASE_ASSERT(IsValidUtf8(twoBytes, twoBytesLen));
 
+  ExpectValidCodePoint(twoBytes, 0x0606);
+
   // 3
   static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA
   constexpr size_t threeBytesLen = ArrayLength(threeBytes);
   static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
   MOZ_RELEASE_ASSERT(IsValidUtf8(threeBytes, threeBytesLen));
 
+  ExpectValidCodePoint(threeBytes, 0x1A1E);
+
   // 4
   static const char fourBytes[] = u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06
   constexpr size_t fourBytesLen = ArrayLength(fourBytes);
   static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
   MOZ_RELEASE_ASSERT(IsValidUtf8(fourBytes, fourBytesLen));
 
+  ExpectValidCodePoint(fourBytes, 0x1F061);
+
   // Max code point
   static const char maxCodePoint[] = u8"􏿿"; // U+10FFFF
   constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
   static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
   MOZ_RELEASE_ASSERT(IsValidUtf8(maxCodePoint, maxCodePointLen));
 
+  ExpectValidCodePoint(maxCodePoint, 0x10FFFF);
+
   // One past max code point
-  static unsigned const char onePastMaxCodePoint[] = { 0xF4, 0x90, 0x80, 0x80 };
+  static const unsigned char onePastMaxCodePoint[] = { 0xF4, 0x90, 0x80, 0x80, 0x0 };
   constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint);
   MOZ_RELEASE_ASSERT(!IsValidUtf8(onePastMaxCodePoint, onePastMaxCodePointLen));
 
+  ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);
+
   // Surrogate-related testing
 
-  static const unsigned char justBeforeSurrogates[] = { 0xED, 0x9F, 0xBF };
-  MOZ_RELEASE_ASSERT(IsValidUtf8(justBeforeSurrogates, ArrayLength(justBeforeSurrogates)));
+  // (Note that the various code unit sequences here are null-terminated to
+  // simplify life for ExpectValidCodePoint, which presumes null termination.)
+
+  static const unsigned char justBeforeSurrogates[] = { 0xED, 0x9F, 0xBF, 0x0 };
+  constexpr size_t justBeforeSurrogatesLen = ArrayLength(justBeforeSurrogates) - 1;
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justBeforeSurrogates, justBeforeSurrogatesLen));
+
+  ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);
+
+  static const unsigned char leastSurrogate[] = { 0xED, 0xA0, 0x80, 0x0 };
+  constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, leastSurrogateLen));
+
+  ExpectBadCodePoint(leastSurrogate, 0xD800, 3);
+
+  static const unsigned char arbitraryHighSurrogate[] = { 0xED, 0xA2, 0x87, 0x0 };
+  constexpr size_t arbitraryHighSurrogateLen = ArrayLength(arbitraryHighSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryHighSurrogate, arbitraryHighSurrogateLen));
+
+  ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);
+
+  static const unsigned char arbitraryLowSurrogate[] = { 0xED, 0xB7, 0xAF, 0x0 };
+  constexpr size_t arbitraryLowSurrogateLen = ArrayLength(arbitraryLowSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryLowSurrogate, arbitraryLowSurrogateLen));
+
+  ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);
+
+  static const unsigned char greatestSurrogate[] = { 0xED, 0xBF, 0xBF, 0x0 };
+  constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1;
+  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, greatestSurrogateLen));
+
+  ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);
+
+  static const unsigned char justAfterSurrogates[] = { 0xEE, 0x80, 0x80, 0x0 };
+  constexpr size_t justAfterSurrogatesLen = ArrayLength(justAfterSurrogates) - 1;
+  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, justAfterSurrogatesLen));
+
+  ExpectValidCodePoint(justAfterSurrogates, 0xE000);
+}
+
+static void
+TestDecodeOneValidUtf8CodePoint() // Each literal below must encode exactly the code point passed next to it.
+{
+  // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
+  //       consist of multiple code units, so there are no ASCII tests below.
+
+  // Length two.  (Code points that editors or transports tend to mangle are spelled as \u escapes.)
+
+  ExpectValidCodePoint(u8"\u0080", 0x80); // <control>
+  ExpectValidCodePoint(u8"©", 0xA9); // COPYRIGHT SIGN
+  ExpectValidCodePoint(u8"¶", 0xB6); // PILCROW SIGN
+  ExpectValidCodePoint(u8"¾", 0xBE); // VULGAR FRACTION THREE QUARTERS
+  ExpectValidCodePoint(u8"÷", 0xF7); // DIVISION SIGN
+  ExpectValidCodePoint(u8"ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS
+  ExpectValidCodePoint(u8"Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON
+  ExpectValidCodePoint(u8"\u0132", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ
+  ExpectValidCodePoint(u8"ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
+  ExpectValidCodePoint(u8"Ӝ", 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+  ExpectValidCodePoint(u8"۩", 0x6E9); // ARABIC PLACE OF SAJDAH
+  ExpectValidCodePoint(u8"߿", 0x7FF); // <not assigned>
+
+  // Length three.
+
+  ExpectValidCodePoint(u8"ࠀ", 0x800); // SAMARITAN LETTER ALAF
+  ExpectValidCodePoint(u8"ࡁ", 0x841); // MANDAIC LETTER AB
+  ExpectValidCodePoint(u8"ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA
+  ExpectValidCodePoint(u8"ஆ", 0xB86); // TAMIL LETTER AA
+  ExpectValidCodePoint(u8"༃", 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
+  ExpectValidCodePoint(u8"࿉", 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
+  ExpectValidCodePoint(u8"ဪ", 0x102A); // MYANMAR LETTER AU
+  ExpectValidCodePoint(u8"ᚏ", 0x168F); // OGHAM LETTER RUIS
+  ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR
+  ExpectValidCodePoint("\xE2\x80\xA9", 0x2029); // (the hated) PARAGRAPH SEPARATOR
+  ExpectValidCodePoint(u8"☬", 0x262C); // ADI SHAKTI
+  ExpectValidCodePoint(u8"㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE
+  ExpectValidCodePoint(u8"㏖", 0x33D6); // SQUARE MOL
+  ExpectValidCodePoint(u8"ꔄ", 0xA504); // VAI SYLLABLE WEEN
+  ExpectValidCodePoint(u8"ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
+  ExpectValidCodePoint(u8"퟿", 0xD7FF); // <not assigned>
+  ExpectValidCodePoint(u8"\uE000", 0xE000); // <Private Use>
+  ExpectValidCodePoint(u8"\uF9F2", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F2 (escaped: normalization would replace it with U+9C57)
+  ExpectValidCodePoint(u8"﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
+  ExpectValidCodePoint(u8"￿", 0xFFFF); // <not assigned>
+
+  // Length four.
+  ExpectValidCodePoint(u8"𐀀", 0x10000); // LINEAR B SYLLABLE B008 A
+  ExpectValidCodePoint(u8"𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058
+  ExpectValidCodePoint(u8"𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI
+  ExpectValidCodePoint(u8"💩", 0x1F4A9); // PILE OF POO
+  ExpectValidCodePoint(u8"🔫", 0x1F52B); // PISTOL
+  ExpectValidCodePoint(u8"🥌", 0x1F94C); // CURLING STONE
+  ExpectValidCodePoint(u8"🥏", 0x1F94F); // FLYING DISC
+  ExpectValidCodePoint(u8"𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346
+  ExpectValidCodePoint(u8"𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A
+  ExpectValidCodePoint(u8"񁟶", 0x417F6); // <not assigned>
+  ExpectValidCodePoint(u8"񾠶", 0x7E836); // <not assigned>
+  ExpectValidCodePoint(u8"󾽧", 0xFEF67); // <Plane 15 Private Use>
+  ExpectValidCodePoint(u8"􏿿", 0x10FFFF); // <not assigned>; greatest code point
+}
+
+static void
+TestDecodeBadLeadUnit()
+{
+  // These tests are actually exhaustive: 0x80..0xBF and 0xF8..0xFF are all tried.
 
-  static const unsigned char leastSurrogate[] = { 0xED, 0xA0, 0x80 };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, ArrayLength(leastSurrogate)));
+  unsigned char badLead[] = { '\0', '\0' }; // [0] is overwritten per case; [1] stays nul
+
+  for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) { // units of the form 10xxxxxx
+    badLead[0] = lead;
+    ExpectBadLeadUnit(badLead);
+  }
+
+  {
+    uint8_t lead = 0b1111'1000; // units of the form 11111xxx
+    do {
+      badLead[0] = lead;
+      ExpectBadLeadUnit(badLead);
+      if (lead == 0b1111'1111) { // stop before the increment so the uint8_t counter never wraps
+        break;
+      }
+
+      lead++;
+    } while (true);
+  }
+}
+
+static void
+TestTooFewOrBadTrailingUnits() // For every multi-unit lead: truncated input, then each bad value in each trailing slot.
+{
+  // Lead unit indicates a two-byte code point.
+
+  char truncatedTwo[] = { '\0', '\0' };
+  char badTrailTwo[] = { '\0', '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
+    truncatedTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedTwo, 1, 2);
+
+    badTrailTwo[0] = lead;
+    for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailTwo[1] = trail;
+      ExpectBadTrailingUnit(badTrailTwo, 2);
+    }
+
+    for (uint8_t trail : IntegerRange(0b1100'0000, 0b1'0000'0000)) { // end is exclusive: 0x100 so that 0xFF is covered too
+      badTrailTwo[1] = trail;
+      ExpectBadTrailingUnit(badTrailTwo, 2);
+    }
+  }
+
+  // Lead unit indicates a three-byte code point.
+
+  char truncatedThreeOne[] = { '\0', '\0' };
+  char truncatedThreeTwo[] = { '\0', '\0', '\0' };
+  unsigned char badTrailThree[] = { '\0', '\0', '\0', '\0' };
 
-  static const unsigned char arbitraryHighSurrogate[] = { 0xED, 0xA2, 0x87 };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryHighSurrogate, ArrayLength(arbitraryHighSurrogate)));
+  for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
+    truncatedThreeOne[0] = lead;
+    ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);
+
+    truncatedThreeTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);
+
+    badTrailThree[0] = lead;
+    badTrailThree[2] = 0b1011'1111; // make valid to test overreads
+    for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailThree[1] = mid;
+      ExpectBadTrailingUnit(badTrailThree, 2);
+    }
+    {
+      uint8_t mid = 0b1100'0000;
+      do {
+        badTrailThree[1] = mid;
+        ExpectBadTrailingUnit(badTrailThree, 2);
+        if (mid == 0b1111'1111) { // stop before the increment so the uint8_t counter never wraps
+          break;
+        }
+
+        mid++;
+      } while (true);
+    }
+
+    badTrailThree[1] = 0b1011'1111; // second unit valid, so the third is the first bad one
+    for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailThree[2] = last;
+      ExpectBadTrailingUnit(badTrailThree, 3);
+    }
+    {
+      uint8_t last = 0b1100'0000;
+      do {
+        badTrailThree[2] = last;
+        ExpectBadTrailingUnit(badTrailThree, 3);
+        if (last == 0b1111'1111) {
+          break;
+        }
+
+        last++;
+      } while (true);
+    }
+  }
+
+  // Lead unit indicates a four-byte code point.
+
+  char truncatedFourOne[] = { '\0', '\0' };
+  char truncatedFourTwo[] = { '\0', '\0', '\0' };
+  char truncatedFourThree[] = { '\0', '\0', '\0', '\0' };
+
+  unsigned char badTrailFour[] = { '\0', '\0', '\0', '\0', '\0' };
+
+  for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
+    truncatedFourOne[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourOne, 1, 4);
 
-  static const unsigned char arbitraryLowSurrogate[] = { 0xED, 0xB7, 0xAF };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(arbitraryLowSurrogate, ArrayLength(arbitraryLowSurrogate)));
+    truncatedFourTwo[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);
+
+    truncatedFourThree[0] = lead;
+    ExpectNotEnoughUnits(truncatedFourThree, 3, 4);
+
+    badTrailFour[0] = lead;
+    badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads
+    for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[1] = second;
+      ExpectBadTrailingUnit(badTrailFour, 2);
+    }
+    {
+      uint8_t second = 0b1100'0000;
+      do {
+        badTrailFour[1] = second;
+        ExpectBadTrailingUnit(badTrailFour, 2);
+        if (second == 0b1111'1111) {
+          break;
+        }
+
+        second++;
+      } while (true);
+    }
+
+    badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads
+    for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[2] = third;
+      ExpectBadTrailingUnit(badTrailFour, 3);
+    }
+    {
+      uint8_t third = 0b1100'0000;
+      do {
+        badTrailFour[2] = third;
+        ExpectBadTrailingUnit(badTrailFour, 3);
+        if (third == 0b1111'1111) {
+          break;
+        }
+
+        third++;
+      } while (true);
+    }
+
+    badTrailFour[2] = 0b1011'1111; // second and third units valid, so the fourth is the first bad one
+    for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
+      badTrailFour[3] = fourth;
+      ExpectBadTrailingUnit(badTrailFour, 4);
+    }
+    {
+      uint8_t fourth = 0b1100'0000;
+      do {
+        badTrailFour[3] = fourth;
+        ExpectBadTrailingUnit(badTrailFour, 4);
+        if (fourth == 0b1111'1111) {
+          break;
+        }
+
+        fourth++;
+      } while (true);
+    }
+  }
+}
+
+static void
+TestBadSurrogate()
+{
+  // These tests are actually exhaustive: every code point in [0xD800, 0xE000) is tried.
+
+  ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates
+  ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates
+
+  // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }.  Last invalid
+  // surrogate encoding is { 0xED, 0xBF, 0xBF }.
+
+  char badSurrogate[] = { '\xED', '\0', '\0', '\0' };
+
+  for (char32_t c = 0xD800; c < 0xE000; c++) {
+    badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6);
+    badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); // XOR acts as OR in both lines: the payload only occupies the low six bits
+
+    ExpectBadCodePoint(badSurrogate, c, 3);
+  }
+}
+
+static void
+TestBadTooBig()
+{
+  // These tests are actually exhaustive.
+
+  ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point
+
+  // Four-byte code points are
+  //
+  //   0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
+  //
+  // with 3 + 6 + 6 + 6 == 21 unconstrained bits, so the structurally
+  // representable limit (exclusive) is 2**21 == 2097152.
+
+  char tooLargeCodePoint[] = { '\0', '\0', '\0', '\0', '\0' };
+
+  for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
+    tooLargeCodePoint[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
+    tooLargeCodePoint[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
+    tooLargeCodePoint[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
+    tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
 
-  static const unsigned char greatestSurrogate[] = { 0xED, 0xBF, 0xBF };
-  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, ArrayLength(greatestSurrogate)));
+    ExpectBadCodePoint(tooLargeCodePoint, c, 4);
+  }
+}
+
+static void
+TestBadCodePoint() // Runs both bad-code-point suites in order.
+{
+  TestBadSurrogate(); // code points in [0xD800, 0xE000)
+  TestBadTooBig(); // code points in [0x110000, 1 << 21)
+}
+
+static void
+TestNotShortestForm() // Encodes each code point in more units than it needs and expects NotShortestForm.
+{
+  {
+    // One-byte (c < 0x80) in two-byte.
+
+    char oneInTwo[] = { '\0', '\0', '\0' };
+
+    for (char32_t c = '\0'; c < 0x80; c++) {
+      oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6);
+      oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
+
+      ExpectNotShortestForm(oneInTwo, c, 2);
+    }
+
+    // One-byte (c < 0x80) in three-byte.
+
+    char oneInThree[] = { '\0', '\0', '\0', '\0' };
+
+    for (char32_t c = '\0'; c < 0x80; c++) {
+      oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
+      oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
+      oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
+
+      ExpectNotShortestForm(oneInThree, c, 3);
+    }
+
+    // One-byte (c < 0x80) in four-byte.
+
+    char oneInFour[] = { '\0', '\0', '\0', '\0', '\0' };
+
+    for (char32_t c = '\0'; c < 0x80; c++) {
+      oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
+      oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
+      oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
+      oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
+
+      ExpectNotShortestForm(oneInFour, c, 4);
+    }
+  }
+
+  {
+    // Two-byte (0x80 <= c < 0x800) in three-byte.
 
-  static const unsigned char justAfterSurrogates[] = { 0xEE, 0x80, 0x80 };
-  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, ArrayLength(justAfterSurrogates)));
+    char twoInThree[] = { '\0', '\0', '\0', '\0' };
+
+    for (char32_t c = 0x80; c < 0x800; c++) {
+      twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
+      twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
+      twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
+
+      ExpectNotShortestForm(twoInThree, c, 3);
+    }
+
+    // Two-byte (0x80 <= c < 0x800) in four-byte.
+
+    char twoInFour[] = { '\0', '\0', '\0', '\0', '\0' };
+
+    for (char32_t c = 0x80; c < 0x800; c++) {
+      twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
+      twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
+      twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
+      twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
+
+      ExpectNotShortestForm(twoInFour, c, 4);
+    }
+  }
+
+  {
+    // Three-byte (0x800 <= c < 0x10000) in four-byte.
+
+    char threeInFour[] = { '\0', '\0', '\0', '\0', '\0' };
+
+    for (char32_t c = 0x800; c < 0x1'0000; c++) {
+      threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
+      threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
+      threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
+      threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
+
+      ExpectNotShortestForm(threeInFour, c, 4);
+    }
+  }
+}
+
+static void
+TestDecodeOneInvalidUtf8CodePoint() // Runs the four invalid-input suites in order.
+{
+  TestDecodeBadLeadUnit();
+  TestTooFewOrBadTrailingUnits();
+  TestBadCodePoint();
+  TestNotShortestForm();
+}
+
+static void
+TestDecodeOneUtf8CodePoint() // Entry point for the code point decoding tests: valid inputs first, then invalid ones.
+{
+  TestDecodeOneValidUtf8CodePoint();
+  TestDecodeOneInvalidUtf8CodePoint();
 }
 
 int
 main()
 {
   TestUtf8Unit();
   TestIsValidUtf8();
+  TestDecodeOneUtf8CodePoint(); // code point decoding tests run after the validity tests
   return 0;
 }