Bug 1478587 - Implement mozilla::Utf8AsUnsignedChars to centralize UTF-8-to-unsigned-chars casts and their justifications. r=froydnj
☠☠ backed out by fd12c0a357b3 ☠ ☠
author Jeff Walden <jwalden@mit.edu>
Thu, 26 Jul 2018 19:43:33 -0700
changeset 428896 d1cd66e6d3c3a76928f19ffa7f2f0a045999c8d6
parent 428895 63ccd68e1da3a41f1e99d4e7bb991b6c9308761b
child 428897 aa8a0735f30395cbc9ff637a87feec2a0422f624
push id 105801
push user jwalden@mit.edu
push date Sun, 29 Jul 2018 04:09:23 +0000
treeherder mozilla-inbound@185a4564afa5 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers froydnj
bugs 1478587
milestone 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1478587 - Implement mozilla::Utf8AsUnsignedChars to centralize UTF-8-to-unsigned-chars casts and their justifications. r=froydnj
mfbt/Utf8.h
--- a/mfbt/Utf8.h
+++ b/mfbt/Utf8.h
@@ -193,16 +193,50 @@ public:
     return static_cast<uint8_t>(mValue);
   }
 
   // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
   // that's a somewhat separate concern, justified in different comments in
   // that other code.
 };
 
+/**
+ * Reinterpret |aUnits|, the address of a UTF-8 code unit, as a pointer to
+ * |const unsigned char|.  Assuming proper backing has been set up, the
+ * returned pointer may validly be dereferenced.
+ *
+ * No access is provided to mutate this underlying memory as |unsigned char|:
+ * only a pointer-to-const is returned.  Presently memory inside |Utf8Unit| is
+ * *only* stored as |char|, and we are loath to offer a way to write non-|char|
+ * data until absolutely necessary.
+ */
+inline const unsigned char*
+Utf8AsUnsignedChars(const Utf8Unit* aUnits)
+{
+  static_assert(sizeof(Utf8Unit) == sizeof(unsigned char),
+                "sizes must match to permissibly reinterpret_cast<>");
+  static_assert(alignof(Utf8Unit) == alignof(unsigned char),
+                "alignment must match to permissibly reinterpret_cast<>");
+
+  // The static_asserts above only enable the reinterpret_cast<> to occur.
+  //
+  // Dereferencing the resulting pointer is a separate question.  Any object's
+  // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
+  // this doesn't guarantee what values will be observed.  If |char| is
+  // implemented to act like |unsigned char|, we're good to go: memory for the
+  // |char| in |Utf8Unit| acts as we need.  But if |char| is implemented to act
+  // like |signed char|, dereferencing produces the right value only if the
+  // |char| types all use two's-complement representation.  Every modern
+  // compiler does this, and there's a C++ proposal to standardize it:
+  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html
+  // So *technically* this is implementation-defined -- but everyone does it,
+  // and this behavior is being standardized.
+  return reinterpret_cast<const unsigned char*>(aUnits);
+}
+
 /** Returns true iff |aUnit| is an ASCII value. */
 inline bool
 IsAscii(Utf8Unit aUnit)
 {
   return IsAscii(aUnit.toUint8());
 }
 
 /**