Bug 1362154: Part 1: Rewrite FlatStringReader r=mgaudet
authorIain Ireland <iireland@mozilla.com>
Wed, 20 May 2020 17:30:56 +0000
changeset 531646 7f7a204fa0246f1aa85bc58137bfc023ee2b8d43
parent 531645 d0e52aa0a9f0af8fcafddacdcc2851ac375dc250
child 531647 e7d2300295143bf951d8bdecd7619ea78722bb80
push id37441
push userapavel@mozilla.com
push dateFri, 22 May 2020 21:38:53 +0000
treeherdermozilla-central@d6abd35b54ad [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersmgaudet
bugs1362154
milestone78.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1362154: Part 1: Rewrite FlatStringReader r=mgaudet My initial shim of FlatStringReader used AutoCheckCannotGC, but the API we need to support doesn't actually require that. We only need to be able to get characters by index, which at most requires a HandleLinearString. Getting rid of this AutoCheckCannotGC helps pave the way for allocating GC things while parsing regexps, which is a huge simplification for the FixedArray that irregexp uses to return named capture information. Differential Revision: https://phabricator.services.mozilla.com/D76033
js/src/new-regexp/RegExpAPI.cpp
js/src/new-regexp/regexp-shim.h
--- a/js/src/new-regexp/RegExpAPI.cpp
+++ b/js/src/new-regexp/RegExpAPI.cpp
@@ -242,17 +242,17 @@ static bool CheckPatternSyntaxImpl(JSCon
 
   HandleScope handleScope(cx->isolate);
   return RegExpParser::ParseRegExp(cx->isolate, &zone, pattern, flags, result);
 }
 
 bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
                         const mozilla::Range<const char16_t> chars,
                         JS::RegExpFlags flags) {
-  FlatStringReader reader(chars.begin().get(), chars.length());
+  FlatStringReader reader(chars);
   RegExpCompileData result;
   if (!CheckPatternSyntaxImpl(cx, &reader, flags, &result)) {
     ReportSyntaxError(ts, result, chars.begin().get(), chars.length());
     return false;
   }
   if (!result.capture_name_map.is_null()) {
     // We can parse named captures, but we don't support them yet.
     // Until we do, this will report a syntax error:
@@ -261,17 +261,17 @@ bool CheckPatternSyntax(JSContext* cx, T
     ReportSyntaxError(ts, result, chars.begin().get(), chars.length());
     return false;
   }
   return true;
 }
 
 bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
                         HandleAtom pattern, JS::RegExpFlags flags) {
-  FlatStringReader reader(pattern);
+  FlatStringReader reader(cx, pattern);
   RegExpCompileData result;
   if (!CheckPatternSyntaxImpl(cx, &reader, flags, &result)) {
     ReportSyntaxError(ts, result, pattern);
     return false;
   }
   if (!result.capture_name_map.is_null()) {
     // We can parse named captures, but we don't support them yet.
     // Until we do, this will report a syntax error:
@@ -319,28 +319,27 @@ static bool UseBoyerMoore(HandleAtom pat
   if (pattern->hasLatin1Chars()) {
     return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
   }
   MOZ_ASSERT(pattern->hasTwoByteChars());
   return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
 }
 
 // Sample character frequency information for use in Boyer-Moore.
-static void SampleCharacters(HandleLinearString input,
+static void SampleCharacters(FlatStringReader* sample_subject,
                              RegExpCompiler& compiler) {
   static const int kSampleSize = 128;
   int chars_sampled = 0;
 
-  FlatStringReader sample_subject(input);
-  int length = sample_subject.length();
+  int length = sample_subject->length();
 
   int half_way = (length - kSampleSize) / 2;
   for (int i = std::max(0, half_way); i < length && chars_sampled < kSampleSize;
        i++, chars_sampled++) {
-    compiler.frequency_collator()->CountCharacter(sample_subject.Get(i));
+    compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
   }
 }
 
 enum class AssembleResult {
   Success,
   TooLarge,
   OutOfMemory,
 };
@@ -458,17 +457,17 @@ bool CompilePattern(JSContext* cx, Mutab
                     HandleLinearString input, RegExpShared::CodeKind codeKind) {
   RootedAtom pattern(cx, re->getSource());
   JS::RegExpFlags flags = re->getFlags();
   LifoAllocScope allocScope(&cx->tempLifoAlloc());
   Zone zone(allocScope.alloc());
 
   RegExpCompileData data;
   {
-    FlatStringReader patternBytes(pattern);
+    FlatStringReader patternBytes(cx, pattern);
     if (!RegExpParser::ParseRegExp(cx->isolate, &zone, &patternBytes, flags,
                                    &data)) {
       JS::CompileOptions options(cx);
       DummyTokenStream dummyTokenStream(cx, options);
       ReportSyntaxError(dummyTokenStream, data, pattern);
       return false;
     }
   }
@@ -504,17 +503,18 @@ bool CompilePattern(JSContext* cx, Mutab
   MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp);
 
   HandleScope handleScope(cx->isolate);
   RegExpCompiler compiler(cx->isolate, &zone, data.capture_count,
                           input->hasLatin1Chars());
 
   bool isLatin1 = input->hasLatin1Chars();
 
-  SampleCharacters(input, compiler);
+  FlatStringReader sample_subject(cx, input);
+  SampleCharacters(&sample_subject, compiler);
   data.node = compiler.PreprocessRegExp(&data, flags, isLatin1);
   data.error = AnalyzeRegExp(cx->isolate, isLatin1, data.node);
   if (data.error != RegExpError::kNone) {
     MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow);
     ReportOverRecursed(cx);
     return false;
   }
 
--- a/js/src/new-regexp/regexp-shim.h
+++ b/js/src/new-regexp/regexp-shim.h
@@ -855,56 +855,39 @@ template <>
 inline Vector<const uc16> String::GetCharVector(
     const DisallowHeapAllocation& no_gc) {
   String::FlatContent flat = GetFlatContent(no_gc);
   MOZ_ASSERT(flat.IsTwoByte());
   return flat.ToUC16Vector();
 }
 
 // A flat string reader provides random access to the contents of a
-// string independent of the character width of the string.  The handle
-// must be valid as long as the reader is being used.
-// Origin:
-// https://github.com/v8/v8/blob/84f3877c15bc7f8956d21614da4311337525a3c8/src/objects/string.h#L807-L825
+// string independent of the character width of the string.
 class MOZ_STACK_CLASS FlatStringReader {
  public:
-  FlatStringReader(JSLinearString* string)
-    : length_(string->length()),
-      is_latin1_(string->hasLatin1Chars()) {
+  FlatStringReader(JSContext* cx, js::HandleLinearString string)
+    : string_(string), length_(string->length()) {}
 
-    if (is_latin1_) {
-      latin1_chars_ = string->latin1Chars(nogc_);
-    } else {
-      two_byte_chars_ = string->twoByteChars(nogc_);
-    }
-  }
-  FlatStringReader(const char16_t* chars, size_t length)
-    : two_byte_chars_(chars),
-      length_(length),
-      is_latin1_(false) {}
+  FlatStringReader(const mozilla::Range<const char16_t> range)
+    : string_(nullptr), range_(range), length_(range.length()) {}
 
   int length() { return length_; }
 
   inline char16_t Get(size_t index) {
     MOZ_ASSERT(index < length_);
-    if (is_latin1_) {
-      return latin1_chars_[index];
-    } else {
-      return two_byte_chars_[index];
+    if (string_) {
+      return string_->latin1OrTwoByteChar(index);
     }
+    return range_[index];
   }
 
  private:
-  union {
-    const JS::Latin1Char *latin1_chars_;
-    const char16_t* two_byte_chars_;
-  };
+  js::HandleLinearString string_;
+  const mozilla::Range<const char16_t> range_;
   size_t length_;
-  bool is_latin1_;
-  JS::AutoCheckCannotGC nogc_;
 };
 
 class JSRegExp : public HeapObject {
  public:
   JSRegExp() : HeapObject() {}
   JSRegExp(js::RegExpShared* re) { setValue(JS::PrivateGCThingValue(re)); }
 
   // ******************************************************