Bug 1495571 - Part 8: Add support for early escape from the loop. Eliminate the Action enum. r=efaust
authorJason Orendorff <jorendorff@mozilla.com>
Tue, 02 Oct 2018 14:26:35 +0000
changeset 494940 995cd5fca351d5bf3c00e745f3cc22a2fab6fd72
parent 494939 b999f2758e3ea42c804b504dc45ac322bcd7fa80
child 494941 07184f24860bafda10d0eac583236ae2e4544fef
push id9984
push userffxbld-merge
push dateMon, 15 Oct 2018 21:07:35 +0000
treeherdermozilla-beta@183d27ea8570 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersefaust
bugs1495571
milestone64.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1495571 - Part 8: Add support for early escape from the loop. Eliminate the Action enum. r=efaust Depends on D7376 Differential Revision: https://phabricator.services.mozilla.com/D7377
js/src/vm/CharacterEncoding.cpp
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -254,21 +254,19 @@ ReportBufferTooSmall(JSContext* cx, uint
 static void
 ReportTooBigCharacter(JSContext* cx, uint32_t v)
 {
     char buffer[10];
     SprintfLiteral(buffer, "0x%x", v + 0x10000);
     JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
 }
 
-enum InflateUTF8Action {
-    Count,
-    Nop,
-    Copy,
-    FindEncoding
+enum class LoopDisposition {
+    Break,
+    Continue,
 };
 
 enum class OnUTF8Error {
     InsertReplacementCharacter,
     InsertQuestionMark,
     Throw,
     Crash,
 };
@@ -279,38 +277,28 @@ static const char16_t REPLACEMENT_CHARAC
 
 // If making changes to this algorithm, make sure to also update
 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
 //
 // Scan UTF8 input and (internally, at least) convert it to a series of UTF-16
 // code units. But you can also do odd things like pass an empty lambda for
 // `dst`, in which case the output is discarded entirely--the only effect of
 // calling the template that way is error-checking.
-template <InflateUTF8Action Action, OnUTF8Error ErrorAction, typename OutputFn>
+template <OnUTF8Error ErrorAction, typename OutputFn>
 static bool
-InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst,
-                   JS::SmallestEncoding *smallestEncoding)
+InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst)
 {
-    if (Action != Nop) {
-        *smallestEncoding = JS::SmallestEncoding::ASCII;
-    }
-    auto RequireLatin1 = [&smallestEncoding]{
-        *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
-    };
-    auto RequireUTF16 = [&smallestEncoding]{
-        *smallestEncoding = JS::SmallestEncoding::UTF16;
-    };
-
     size_t srclen = src.length();
     for (uint32_t i = 0; i < srclen; i++) {
         uint32_t v = uint32_t(src[i]);
         if (!(v & 0x80)) {
             // ASCII code unit.  Simple copy.
-            dst(uint16_t(v));
-
+            if (dst(uint16_t(v)) == LoopDisposition::Break) {
+                break;
+            }
         } else {
             // Non-ASCII code unit.  Determine its length in bytes (n).
             uint32_t n = 1;
             while (v & (0x80 >> n)) {
                 n++;
             }
 
         #define INVALID(report, arg, n2)                                \
@@ -323,17 +311,19 @@ InflateUTF8ToUTF16(JSContext* cx, const 
                 } else {                                                \
                     char16_t replacement;                               \
                     if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
                         replacement = REPLACEMENT_CHARACTER;            \
                     } else {                                            \
                         MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
                         replacement = '?';                              \
                     }                                                   \
-                    dst(replacement);                                   \
+                    if (dst(replacement) == LoopDisposition::Break) {   \
+                        break;                                          \
+                    }                                                   \
                     n = n2;                                             \
                     goto invalidMultiByteCodeUnit;                      \
                 }                                                       \
             } while (0)
 
             // Check the leading byte.
             if (n < 2 || n > 4) {
                 INVALID(ReportInvalidCharacter, i, 1);
@@ -358,49 +348,42 @@ InflateUTF8ToUTF16(JSContext* cx, const 
             for (uint32_t m = 1; m < n; m++) {
                 if ((src[i + m] & 0xC0) != 0x80) {
                     INVALID(ReportInvalidCharacter, i, m);
                 }
             }
 
             // Determine the code unit's length in CharT and act accordingly.
             v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
-            if (Action != Nop) {
-                if (v > 0xff) {
-                    RequireUTF16();
-                    if (Action == FindEncoding) {
-                        return true;
-                    }
-                } else {
-                    RequireLatin1();
-                }
-            }
             if (v < 0x10000) {
                 // The n-byte UTF8 code unit will fit in a single CharT.
-                dst(char16_t(v));
+                if (dst(char16_t(v)) == LoopDisposition::Break) {
+                    break;
+                }
             } else {
                 v -= 0x10000;
                 if (v <= 0xFFFFF) {
                     // The n-byte UTF8 code unit will fit in two CharT units.
-                    dst(char16_t((v >> 10) + 0xD800));
-                    dst(char16_t((v & 0x3FF) + 0xDC00));
+                    if (dst(char16_t((v >> 10) + 0xD800)) == LoopDisposition::Break) {
+                        break;
+                    }
+                    if (dst(char16_t((v & 0x3FF) + 0xDC00)) == LoopDisposition::Break) {
+                        break;
+                    }
                 } else {
                     // The n-byte UTF8 code unit won't fit in two CharT units.
                     INVALID(ReportTooBigCharacter, v, 1);
                 }
             }
 
           invalidMultiByteCodeUnit:
-            // Move i to the last byte of the multi-byte code unit;  the loop
+            // Move i to the last byte of the multi-byte code unit; the loop
             // header will do the final i++ to move to the start of the next
             // code unit.
             i += n - 1;
-            if (Action != Nop) {
-                RequireUTF16();
-            }
         }
     }
 
     return true;
 }
 
 template <OnUTF8Error ErrorAction, typename CharsT>
 static CharsT
@@ -408,43 +391,50 @@ InflateUTF8StringHelper(JSContext* cx, c
 {
     using CharT = typename CharsT::CharT;
     static_assert(std::is_same<CharT, char16_t>::value ||
                   std::is_same<CharT, Latin1Char>::value,
                   "bad CharT");
 
     *outlen = 0;
 
-    JS::SmallestEncoding encoding;
     size_t len = 0;
-    auto count = [&](char16_t) { len++; };
-    if (!InflateUTF8ToUTF16<Count, ErrorAction>(cx, src, count, &encoding)) {
+    bool allASCII = true;
+    auto count = [&](char16_t c) -> LoopDisposition {
+        len++;
+        allASCII &= (c < 0x80);
+        return LoopDisposition::Continue;
+    };
+    if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
         return CharsT();
     }
     *outlen = len;
 
     CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
     if (!dst) {
         ReportOutOfMemory(cx);
         return CharsT();
     }
 
-    if (encoding == JS::SmallestEncoding::ASCII) {
+    if (allASCII) {
         size_t srclen = src.length();
         MOZ_ASSERT(*outlen == srclen);
         for (uint32_t i = 0; i < srclen; i++) {
             dst[i] = CharT(src[i]);
         }
     } else {
         constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
             ? OnUTF8Error::InsertQuestionMark
             : OnUTF8Error::InsertReplacementCharacter;
         size_t j = 0;
-        auto push = [&](char16_t c) { dst[j++] = CharT(c); };
-        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<Copy, errorMode>(cx, src, push, &encoding)));
+        auto push = [&](char16_t c) -> LoopDisposition {
+            dst[j++] = CharT(c);
+            return LoopDisposition::Continue;
+        };
+        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<errorMode>(cx, src, push)));
         MOZ_ASSERT(j == len);
     }
     dst[*outlen] = 0;    // NUL char
 
     return CharsT(dst, *outlen);
 }
 
 TwoByteCharsZ
@@ -471,22 +461,30 @@ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSC
 {
     UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
     return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter, TwoByteCharsZ>(cx, chars, outlen);
 }
 
 JS::SmallestEncoding
 JS::FindSmallestEncoding(UTF8Chars utf8)
 {
-    JS::SmallestEncoding encoding;
-    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<FindEncoding, OnUTF8Error::InsertReplacementCharacter>(
-                         /* cx = */ nullptr,
-                         utf8,
-                         [](char16_t) {},
-                         &encoding)));
+    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
+    auto onChar = [&](char16_t c) -> LoopDisposition {
+        if (c >= 0x80) {
+            if (c < 0x100) {
+                encoding = JS::SmallestEncoding::Latin1;
+            } else {
+                encoding = JS::SmallestEncoding::UTF16;
+                return LoopDisposition::Break;
+            }
+        }
+        return LoopDisposition::Continue;
+    };
+    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<OnUTF8Error::InsertReplacementCharacter>(
+                         /* cx = */ nullptr, utf8, onChar)));
     return encoding;
 }
 
 Latin1CharsZ
 JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
     return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(cx, utf8, outlen);
 }
@@ -498,21 +496,18 @@ JS::LossyUTF8CharsToNewLatin1CharsZ(JSCo
 }
 
 #ifdef DEBUG
 void
 JS::ConstUTF8CharsZ::validate(size_t aLength)
 {
     MOZ_ASSERT(data_);
     UTF8Chars chars(data_, aLength);
-    InflateUTF8ToUTF16<Nop, OnUTF8Error::Crash>(
-        /* cx = */ nullptr,
-        chars,
-        [](char16_t) {},
-        /* smallestEncoding = */ nullptr);
+    auto nop = [](char16_t) -> LoopDisposition { return LoopDisposition::Continue; };
+    InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop);
 }
 #endif
 
 bool
 JS::StringIsASCII(const char* s)
 {
     while (*s) {
         if (*s & 0x80) {