Bug 1289003 - Part 2: Add FindSmallestEncoding. r=jwalden
☠☠ backed out by 6135cb7ef5bd ☠ ☠
author Tooru Fujisawa <arai_a@mac.com>
Mon, 15 Aug 2016 15:50:15 +0900
changeset 312397 1bbdfed5b149b231a27692542b6cee5b9f5138a8
parent 312396 b4c7d481bf10a39a99cb41b89bc677434d80dc0c
child 312398 3abe6a1579f9bc79816ab781869b5232a1b3e483
push id 20447
push user kwierso@gmail.com
push date Fri, 02 Sep 2016 20:36:44 +0000
treeherder fx-team@969397f22187 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jwalden
bugs 1289003
milestone 51.0a1
Bug 1289003 - Part 2: Add FindSmallestEncoding. r=jwalden
js/public/CharacterEncoding.h
js/src/vm/CharacterEncoding.cpp
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -284,16 +284,34 @@ GetDeflatedUTF8StringLength(JSFlatString
  * than the length of the string, if the buffer is exhausted before the string
  * is fully encoded).
  */
 JS_PUBLIC_API(void)
 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
                           size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
 
 /*
+ * The smallest character encoding capable of fully representing a particular
+ * string.  Enumerators run narrowest to widest, so they compare with <.
+ */
+enum class SmallestEncoding {
+    ASCII,   // every code point is < 128
+    Latin1,  // every code point is < 256
+    UTF16    // at least one code point is >= 256
+};
+
+/*
+ * Returns the smallest encoding possible for the given string: if all
+ * codepoints are <128 then ASCII, otherwise if all codepoints are <256
+ * Latin-1, else UTF16.
+ */
+JS_PUBLIC_API(SmallestEncoding)
+FindSmallestEncoding(UTF8Chars utf8);
+
+/*
   * Return a null-terminated Latin-1 string copied from the input string,
   * storing its length (excluding null terminator) in |*outlen|.  Fail and
   * report an error if the string contains non-Latin-1 codepoints.  Returns
   * Latin1CharsZ() on failure.
  */
 extern Latin1CharsZ
 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
 
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -3,16 +3,17 @@
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "js/CharacterEncoding.h"
 
 #include "mozilla/Range.h"
 
+#include <algorithm>
 #include <type_traits>
 
 #include "jscntxt.h"
 #include "jsprf.h"
 
 using namespace js;
 
 Latin1CharsZ
@@ -247,47 +248,50 @@ ReportTooBigCharacter(JSContext* cx, uin
     JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
                                  JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
 }
 
 enum InflateUTF8Action {
     CountAndReportInvalids,
     CountAndIgnoreInvalids,
     AssertNoInvalids,
-    Copy
+    Copy,          // write the decoded code units into dst
+    FindEncoding   // classify only: dst is null, returns at first code point > 0xFF
 };
 
 static const char16_t REPLACE_UTF8 = 0xFFFD;
 static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
 
 // If making changes to this algorithm, make sure to also update
 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
 template <InflateUTF8Action Action, typename CharT>
 static bool
 InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
-                          bool* isAsciip)
+                          JS::SmallestEncoding *smallestEncoding)
 {
-    if (Action != AssertNoInvalids)
-        *isAsciip = true;
+    auto RequireLatin1 = [&smallestEncoding]{
+        *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
+    };
+    auto RequireUTF16 = [&smallestEncoding]{
+        *smallestEncoding = JS::SmallestEncoding::UTF16;
+    };
 
     // Count how many code units need to be in the inflated string.
     // |i| is the index into |src|, and |j| is the the index into |dst|.
     size_t srclen = src.length();
     uint32_t j = 0;
     for (uint32_t i = 0; i < srclen; i++, j++) {
         uint32_t v = uint32_t(src[i]);
         if (!(v & 0x80)) {
             // ASCII code unit.  Simple copy.
             if (Action == Copy)
                 dst[j] = CharT(v);
 
         } else {
             // Non-ASCII code unit.  Determine its length in bytes (n).
-            if (Action != AssertNoInvalids)
-                *isAsciip = false;
             uint32_t n = 1;
             while (v & (0x80 >> n))
                 n++;
 
         #define INVALID(report, arg, n2)                                \
             do {                                                        \
                 if (Action == CountAndReportInvalids) {                 \
                     report(cx, arg);                                    \
@@ -296,17 +300,18 @@ InflateUTF8StringToBuffer(JSContext* cx,
                     MOZ_CRASH("invalid UTF-8 string: " # report);       \
                 } else {                                                \
                     if (Action == Copy) {                               \
                         if (std::is_same<decltype(dst[0]), Latin1Char>::value) \
                             dst[j] = CharT(REPLACE_UTF8_LATIN1);        \
                         else                                            \
                             dst[j] = CharT(REPLACE_UTF8);               \
                     } else {                                            \
-                        MOZ_ASSERT(Action == CountAndIgnoreInvalids);   \
+                        MOZ_ASSERT(Action == CountAndIgnoreInvalids ||  \
+                                   Action == FindEncoding);             \
                     }                                                   \
                     n = n2;                                             \
                     goto invalidMultiByteCodeUnit;                      \
                 }                                                       \
             } while (0)
 
             // Check the leading byte.
             if (n < 2 || n > 4)
@@ -322,22 +327,34 @@ InflateUTF8StringToBuffer(JSContext* cx,
                 (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
                 (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
                 (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
             {
                 INVALID(ReportInvalidCharacter, i, 1);
             }
 
             // Check the continuation bytes.
-            for (uint32_t m = 1; m < n; m++)
+            for (uint32_t m = 1; m < n; m++) {
                 if ((src[i + m] & 0xC0) != 0x80)
                     INVALID(ReportInvalidCharacter, i, m);
+            }
 
             // Determine the code unit's length in CharT and act accordingly.
             v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
+            if (Action != AssertNoInvalids) {
+                if (v > 0xff) {
+                    RequireUTF16();
+                    if (Action == FindEncoding) {
+                        MOZ_ASSERT(dst == nullptr);
+                        return true;
+                    }
+                } else {
+                    RequireLatin1();
+                }
+            }
             if (v < 0x10000) {
                 // The n-byte UTF8 code unit will fit in a single CharT.
                 if (Action == Copy)
                     dst[j] = CharT(v);
             } else {
                 v -= 0x10000;
                 if (v <= 0xFFFFF) {
                     // The n-byte UTF8 code unit will fit in two CharT units.
@@ -356,46 +373,46 @@ InflateUTF8StringToBuffer(JSContext* cx,
           invalidMultiByteCodeUnit:
             // Move i to the last byte of the multi-byte code unit;  the loop
             // header will do the final i++ to move to the start of the next
             // code unit.
             i += n - 1;
         }
     }
 
-    if (Action != AssertNoInvalids)
+    if (Action != AssertNoInvalids && Action != FindEncoding)
         *dstlenp = j;
 
     return true;
 }
 
 template <InflateUTF8Action Action, typename CharsT>
 static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
 {
     using CharT = typename CharsT::CharT;
     *outlen = 0;
 
-    bool isAscii;
-    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &isAscii))
+    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;  // callee only ever widens this
+    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding))
         return CharsT();
 
     CharT* dst = cx->pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
     if (!dst) {
         ReportOutOfMemory(cx);
         return CharsT();
     }
 
-    if (isAscii) {
+    if (encoding == JS::SmallestEncoding::ASCII) {
         size_t srclen = src.length();
         MOZ_ASSERT(*outlen == srclen);
         for (uint32_t i = 0; i < srclen; i++)
             dst[i] = CharT(src[i]);
     } else {
-        JS_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &isAscii)));
+        MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &encoding)));
     }
 
     dst[*outlen] = 0;    // NUL char
 
     return CharsT(dst, *outlen);
 }
 
 TwoByteCharsZ
@@ -419,16 +436,29 @@ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSC
 
 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
     UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
     return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
 }
 
+JS::SmallestEncoding
+JS::FindSmallestEncoding(UTF8Chars utf8)
+{
+    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;  // result when no byte is >= 0x80
+    MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t>(
+                         /* cx = */ nullptr,
+                         utf8,
+                         /* dst = */ nullptr,
+                         /* dstlen = */ nullptr,
+                         &encoding)));
+    return encoding;
+}
+
 Latin1CharsZ
 JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
     return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
 }
 
 Latin1CharsZ
 JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)