Bug 1026438 part 5 - Make irregexp work with Latin1 strings. r=bhackett
author: Jan de Mooij <jdemooij@mozilla.com>
Thu, 19 Jun 2014 12:59:40 +0200
changeset: 189603 4df3e4664b11ff9d8f49d3443de567f815f9f6b1
parent: 189547 3087b33b8037dfbad52c622f140a95aee6d59aba
child: 189604 95ce40562f510e1c5f2ab25ab6228d23c0ff59ce
push id: 1
push user: root
push date: Mon, 20 Oct 2014 17:29:22 +0000
reviewers: bhackett
bugs: 1026438
milestone: 33.0a1
Bug 1026438 part 5 - Make irregexp work with Latin1 strings. r=bhackett
js/src/irregexp/NativeRegExpMacroAssembler.cpp
js/src/irregexp/NativeRegExpMacroAssembler.h
js/src/irregexp/RegExpEngine.cpp
js/src/irregexp/RegExpEngine.h
js/src/irregexp/RegExpInterpreter.cpp
js/src/irregexp/RegExpMacroAssembler.cpp
js/src/irregexp/RegExpMacroAssembler.h
js/src/jit-test/tests/latin1/regexp.js
js/src/vm/RegExpObject.cpp
--- a/js/src/irregexp/NativeRegExpMacroAssembler.cpp
+++ b/js/src/irregexp/NativeRegExpMacroAssembler.cpp
@@ -750,17 +750,18 @@ NativeRegExpMacroAssembler::CheckNotBack
         // Parameters are
         //   Address byte_offset1 - Address captured substring's start.
         //   Address byte_offset2 - Address of current character position.
         //   size_t byte_length - length of capture in bytes(!)
         masm.setupUnalignedABICall(3, temp0);
         masm.passABIArg(current_character);
         masm.passABIArg(current_position);
         masm.passABIArg(temp1);
-        masm.callWithABI(JS_FUNC_TO_DATA_PTR(void *, CaseInsensitiveCompareStrings));
+        int (*fun)(const jschar*, const jschar*, size_t) = CaseInsensitiveCompareStrings;
+        masm.callWithABI(JS_FUNC_TO_DATA_PTR(void *, fun));
         masm.storeCallResult(temp0);
 
         masm.PopRegsInMask(volatileRegs);
 
         // Check if function returned non-zero for success or zero for failure.
         masm.branchTest32(Assembler::Zero, temp0, temp0, BranchOrBacktrack(on_no_match));
 
         // On success, increment position by length of capture.
@@ -807,19 +808,22 @@ NativeRegExpMacroAssembler::CheckCharact
     masm.branch32(Assembler::Above, temp0, Imm32(to - from), BranchOrBacktrack(on_not_in_range));
 }
 
 void
 NativeRegExpMacroAssembler::CheckBitInTable(uint8_t *table, Label *on_bit_set)
 {
     IonSpew(SPEW_PREFIX "CheckBitInTable");
 
-    JS_ASSERT(mode_ != ASCII); // Ascii case not handled here.
+    masm.movePtr(ImmPtr(table), temp0);
 
-    masm.movePtr(ImmPtr(table), temp0);
+    // kTableMask is currently 127, so we need to mask even if the input is
+    // Latin1. V8 has the same issue.
+    static_assert(JSString::MAX_LATIN1_CHAR > kTableMask,
+                  "No need to mask if MAX_LATIN1_CHAR <= kTableMask");
     masm.move32(Imm32(kTableSize - 1), temp1);
     masm.and32(current_character, temp1);
 
     masm.load8ZeroExtend(BaseIndex(temp0, temp1, TimesOne), temp0);
     masm.branchTest32(Assembler::NonZero, temp0, temp0, BranchOrBacktrack(on_bit_set));
 }
 
 void
@@ -870,17 +874,25 @@ NativeRegExpMacroAssembler::LoadCurrentC
 }
 
 void
 NativeRegExpMacroAssembler::LoadCurrentCharacterUnchecked(int cp_offset, int characters)
 {
     IonSpew(SPEW_PREFIX "LoadCurrentCharacterUnchecked(%d, %d)", cp_offset, characters);
 
     if (mode_ == ASCII) {
-        MOZ_ASSUME_UNREACHABLE("Ascii loading not implemented");
+        BaseIndex address(input_end_pointer, current_position, TimesOne, cp_offset);
+        if (characters == 4) {
+            masm.load32(address, current_character);
+        } else if (characters == 2) {
+            masm.load16ZeroExtend(address, current_character);
+        } else {
+            JS_ASSERT(characters == 1);
+            masm.load8ZeroExtend(address, current_character);
+        }
     } else {
         JS_ASSERT(mode_ == JSCHAR);
         JS_ASSERT(characters <= 2);
         BaseIndex address(input_end_pointer, current_position, TimesOne, cp_offset * sizeof(jschar));
         if (characters == 2)
             masm.load32(address, current_character);
         else
             masm.load16ZeroExtend(address, current_character);
--- a/js/src/irregexp/NativeRegExpMacroAssembler.h
+++ b/js/src/irregexp/NativeRegExpMacroAssembler.h
@@ -35,29 +35,30 @@
 
 #include "irregexp/RegExpMacroAssembler.h"
 
 namespace js {
 namespace irregexp {
 
 struct InputOutputData
 {
-    const jschar *inputStart;
-    const jschar *inputEnd;
+    const void *inputStart;
+    const void *inputEnd;
 
     // Index into inputStart (in chars) at which to begin matching.
     size_t startIndex;
 
     MatchPairs *matches;
 
     // RegExpMacroAssembler::Result for non-global regexps, number of captures
     // for global regexps.
     int32_t result;
 
-    InputOutputData(const jschar *inputStart, const jschar *inputEnd,
+    template <typename CharT>
+    InputOutputData(const CharT *inputStart, const CharT *inputEnd,
                     size_t startIndex, MatchPairs *matches)
       : inputStart(inputStart),
         inputEnd(inputEnd),
         startIndex(startIndex),
         matches(matches),
         result(0)
     {}
 };
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -1661,34 +1661,47 @@ irregexp::CompilePattern(JSContext *cx, 
         assembler->set_global_mode((data->tree->min_match() > 0)
                                    ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
                                    : RegExpMacroAssembler::GLOBAL);
     }
 
     return compiler.Assemble(cx, assembler, node, data->capture_count);
 }
 
+template <typename CharT>
 RegExpRunStatus
-irregexp::ExecuteCode(JSContext *cx, jit::JitCode *codeBlock,
-                      const jschar *chars, size_t start, size_t length, MatchPairs *matches)
+irregexp::ExecuteCode(JSContext *cx, jit::JitCode *codeBlock, const CharT *chars, size_t start,
+                      size_t length, MatchPairs *matches)
 {
 #ifdef JS_ION
     typedef void (*RegExpCodeSignature)(InputOutputData *);
 
     InputOutputData data(chars, chars + length, start, matches);
 
     RegExpCodeSignature function = reinterpret_cast<RegExpCodeSignature>(codeBlock->raw());
-    CALL_GENERATED_REGEXP(function, &data);
+
+    {
+        JS::AutoSuppressGCAnalysis nogc;
+        CALL_GENERATED_REGEXP(function, &data);
+    }
 
     return (RegExpRunStatus) data.result;
 #else
     MOZ_CRASH();
 #endif
 }
 
+template RegExpRunStatus
+irregexp::ExecuteCode(JSContext *cx, jit::JitCode *codeBlock, const Latin1Char *chars, size_t start,
+                      size_t length, MatchPairs *matches);
+
+template RegExpRunStatus
+irregexp::ExecuteCode(JSContext *cx, jit::JitCode *codeBlock, const jschar *chars, size_t start,
+                      size_t length, MatchPairs *matches);
+
 // -------------------------------------------------------------------
 // Tree to graph conversion
 
 RegExpNode *
 RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success)
 {
     TextElementVector *elms =
         compiler->alloc()->newInfallible<TextElementVector>(*compiler->alloc());
--- a/js/src/irregexp/RegExpEngine.h
+++ b/js/src/irregexp/RegExpEngine.h
@@ -103,23 +103,25 @@ struct RegExpCode
 
 RegExpCode
 CompilePattern(JSContext *cx, RegExpShared *shared, RegExpCompileData *data,
                HandleLinearString sample,  bool is_global, bool ignore_case = false,
                bool is_ascii = false);
 
 // Note: this may return RegExpRunStatus_Error if an interrupt was requested
 // while the code was executing.
+template <typename CharT>
 RegExpRunStatus
-ExecuteCode(JSContext *cx, jit::JitCode *codeBlock,
-            const jschar *chars, size_t start, size_t length, MatchPairs *matches);
+ExecuteCode(JSContext *cx, jit::JitCode *codeBlock, const CharT *chars, size_t start,
+            size_t length, MatchPairs *matches);
 
+template <typename CharT>
 RegExpRunStatus
-InterpretCode(JSContext *cx, const uint8_t *byteCode,
-              const jschar *chars, size_t start, size_t length, MatchPairs *matches);
+InterpretCode(JSContext *cx, const uint8_t *byteCode, const CharT *chars, size_t start,
+              size_t length, MatchPairs *matches);
 
 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
   VISIT(End)                                                         \
   VISIT(Action)                                                      \
   VISIT(Choice)                                                      \
   VISIT(BackReference)                                               \
   VISIT(Assertion)                                                   \
   VISIT(Text)
--- a/js/src/irregexp/RegExpInterpreter.cpp
+++ b/js/src/irregexp/RegExpInterpreter.cpp
@@ -99,23 +99,24 @@ static int32_t
 Load16Aligned(const uint8_t* pc)
 {
     JS_ASSERT((reinterpret_cast<uintptr_t>(pc) & 1) == 0);
     return *reinterpret_cast<const uint16_t *>(pc);
 }
 
 #define BYTECODE(name)  case BC_##name:
 
+template <typename CharT>
 RegExpRunStatus
-irregexp::InterpretCode(JSContext *cx, const uint8_t *byteCode,
-                        const jschar *chars, size_t current, size_t length, MatchPairs *matches)
+irregexp::InterpretCode(JSContext *cx, const uint8_t *byteCode, const CharT *chars, size_t current,
+                        size_t length, MatchPairs *matches)
 {
     const uint8_t* pc = byteCode;
 
-    jschar current_char = current ? chars[current - 1] : '\n';
+    uint32_t current_char = current ? chars[current - 1] : '\n';
 
     RegExpStackCursor stack(cx);
 
     int32_t numRegisters = Load32Aligned(pc);
     pc += 4;
 
     Vector<int32_t, 0, SystemAllocPolicy> registers;
     if (!registers.growByUninitialized(numRegisters))
@@ -221,18 +222,18 @@ irregexp::InterpretCode(JSContext *cx, c
             pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
             break;
           }
           BYTECODE(LOAD_2_CURRENT_CHARS) {
             size_t pos = current + (insn >> BYTECODE_SHIFT);
             if (pos + 2 > length) {
                 pc = byteCode + Load32Aligned(pc + 4);
             } else {
-                jschar next = chars[pos + 1];
-                current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(jschar))));
+                CharT next = chars[pos + 1];
+                current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(CharT))));
                 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
             }
             break;
           }
           BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
             int pos = current + (insn >> BYTECODE_SHIFT);
-            jschar next = chars[pos + 1];
-            current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(jschar))));
+            CharT next = chars[pos + 1];
+            current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(CharT))));
@@ -416,17 +417,17 @@ irregexp::InterpretCode(JSContext *cx, c
             if (from < 0 || len <= 0) {
                 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
                 break;
             }
             if (current + len > length) {
                 pc = byteCode + Load32Aligned(pc + 4);
                 break;
             }
-            if (CaseInsensitiveCompareStrings(chars + from, chars + current, len * 2)) {
+            if (CaseInsensitiveCompareStrings(chars + from, chars + current, len * sizeof(CharT))) {
                 current += len;
                 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
             } else {
                 pc = byteCode + Load32Aligned(pc + 4);
             }
             break;
           }
           BYTECODE(CHECK_AT_START)
@@ -451,8 +452,16 @@ irregexp::InterpretCode(JSContext *cx, c
             break;
           }
           default:
             MOZ_ASSUME_UNREACHABLE("Bad bytecode");
             break;
         }
     }
 }
+
+template RegExpRunStatus
+irregexp::InterpretCode(JSContext *cx, const uint8_t *byteCode, const Latin1Char *chars, size_t current,
+                        size_t length, MatchPairs *matches);
+
+template RegExpRunStatus
+irregexp::InterpretCode(JSContext *cx, const uint8_t *byteCode, const jschar *chars, size_t current,
+                        size_t length, MatchPairs *matches);
--- a/js/src/irregexp/RegExpMacroAssembler.cpp
+++ b/js/src/irregexp/RegExpMacroAssembler.cpp
@@ -30,37 +30,46 @@
 
 #include "irregexp/RegExpMacroAssembler.h"
 
 #include "irregexp/RegExpBytecode.h"
 
 using namespace js;
 using namespace js::irregexp;
 
+template <typename CharT>
 int
-irregexp::CaseInsensitiveCompareStrings(const jschar *substring1, const jschar *substring2,
+irregexp::CaseInsensitiveCompareStrings(const CharT *substring1, const CharT *substring2,
 					size_t byteLength)
 {
-    JS_ASSERT(byteLength % 2 == 0);
-    size_t length = byteLength >> 1;
+    JS_ASSERT(byteLength % sizeof(CharT) == 0);
+    size_t length = byteLength / sizeof(CharT);
 
     for (size_t i = 0; i < length; i++) {
         jschar c1 = substring1[i];
         jschar c2 = substring2[i];
         if (c1 != c2) {
             c1 = unicode::ToLowerCase(c1);
             c2 = unicode::ToLowerCase(c2);
             if (c1 != c2)
                 return 0;
         }
     }
 
     return 1;
 }
 
+template int
+irregexp::CaseInsensitiveCompareStrings(const Latin1Char *substring1, const Latin1Char *substring2,
+					size_t byteLength);
+
+template int
+irregexp::CaseInsensitiveCompareStrings(const jschar *substring1, const jschar *substring2,
+					size_t byteLength);
+
 InterpretedRegExpMacroAssembler::InterpretedRegExpMacroAssembler(LifoAlloc *alloc, RegExpShared *shared,
                                                                  size_t numSavedRegisters)
   : RegExpMacroAssembler(*alloc, shared, numSavedRegisters),
     pc_(0),
     advance_current_start_(0),
     advance_current_offset_(0),
     advance_current_end_(kInvalidPC),
     buffer_(nullptr),
--- a/js/src/irregexp/RegExpMacroAssembler.h
+++ b/js/src/irregexp/RegExpMacroAssembler.h
@@ -212,18 +212,19 @@ class MOZ_STACK_CLASS RegExpMacroAssembl
         if (num_registers_ <= reg)
             num_registers_ = reg + 1;
     }
 
   public:
     RegExpShared *shared;
 };
 
+template <typename CharT>
 int
-CaseInsensitiveCompareStrings(const jschar *substring1, const jschar *substring2, size_t byteLength);
+CaseInsensitiveCompareStrings(const CharT *substring1, const CharT *substring2, size_t byteLength);
 
 class MOZ_STACK_CLASS InterpretedRegExpMacroAssembler : public RegExpMacroAssembler
 {
   public:
     InterpretedRegExpMacroAssembler(LifoAlloc *alloc, RegExpShared *shared, size_t numSavedRegisters);
     ~InterpretedRegExpMacroAssembler();
 
     // Inherited virtual methods.
--- a/js/src/jit-test/tests/latin1/regexp.js
+++ b/js/src/jit-test/tests/latin1/regexp.js
@@ -8,8 +8,26 @@ assertEq(re.sticky, false);
 
 // TwoByte
 re = new RegExp("foo[bB]a\\r\u1200", "im");
 assertEq(isLatin1(re.source), false);
 assertEq(re.source, "foo[bB]a\\r\u1200");
 assertEq(re.multiline, true);
 assertEq(re.ignoreCase, true);
 assertEq(re.sticky, false);
+
+re = /b[aA]r/;
+
+// Latin1
+assertEq(toLatin1("foobAr1234").search(re), 3);
+assertEq(toLatin1("bar1234").search(re), 0);
+assertEq(toLatin1("foobbr1234").search(re), -1);
+
+// TwoByte
+assertEq("foobAr1234\u1200".search(re), 3);
+assertEq("bar1234\u1200".search(re), 0);
+assertEq("foobbr1234\u1200".search(re), -1);
+
+re = /abcdefghijklm[0-5]/;
+assertEq(toLatin1("1abcdefghijklm4").search(re), 1);
+assertEq("\u12001abcdefghijklm0".search(re), 2);
+assertEq(toLatin1("1abcdefghijklm8").search(re), -1);
+assertEq("\u12001abcdefghijklm8".search(re), -1);
--- a/js/src/vm/RegExpObject.cpp
+++ b/js/src/vm/RegExpObject.cpp
@@ -23,16 +23,18 @@
 #include "vm/Shape-inl.h"
 
 using namespace js;
 
 using mozilla::DebugOnly;
 using mozilla::Maybe;
 using js::frontend::TokenStream;
 
+using JS::AutoCheckCannotGC;
+
 JS_STATIC_ASSERT(IgnoreCaseFlag == JSREG_FOLD);
 JS_STATIC_ASSERT(GlobalFlag == JSREG_GLOB);
 JS_STATIC_ASSERT(MultilineFlag == JSREG_MULTILINE);
 JS_STATIC_ASSERT(StickyFlag == JSREG_STICKY);
 
 /* RegExpObjectBuilder */
 
 RegExpObjectBuilder::RegExpObjectBuilder(ExclusiveContext *cx, RegExpObject *reobj)
@@ -587,34 +589,51 @@ RegExpShared::execute(JSContext *cx, Han
 
         matches.checkAgainst(origLength);
         *lastIndex = matches[0].limit;
         return RegExpRunStatus_Success;
     }
 
     if (uint8_t *byteCode = maybeByteCode(input->hasLatin1Chars())) {
         AutoTraceLog logInterpreter(logger, TraceLogger::IrregexpExecute);
-        const jschar *chars = input->chars() + charsOffset;
-        RegExpRunStatus result =
-            irregexp::InterpretCode(cx, byteCode, chars, start, length, &matches);
+
+        AutoStableStringChars inputChars(cx, input);
+        if (!inputChars.init())
+            return RegExpRunStatus_Error;
+
+        RegExpRunStatus result;
+        if (inputChars.isLatin1()) {
+            const Latin1Char *chars = inputChars.latin1Range().start().get() + charsOffset;
+            result = irregexp::InterpretCode(cx, byteCode, chars, start, length, &matches);
+        } else {
+            const jschar *chars = inputChars.twoByteRange().start().get() + charsOffset;
+            result = irregexp::InterpretCode(cx, byteCode, chars, start, length, &matches);
+        }
+
         if (result == RegExpRunStatus_Success) {
             matches.displace(displacement);
             matches.checkAgainst(origLength);
             *lastIndex = matches[0].limit;
         }
         return result;
     }
 
 #ifdef JS_ION
     while (true) {
         RegExpRunStatus result;
         {
             AutoTraceLog logJIT(logger, TraceLogger::IrregexpExecute);
-            const jschar *chars = input->chars() + charsOffset;
-            result = irregexp::ExecuteCode(cx, jitCodeTwoByte, chars, start, length, &matches);
+            AutoCheckCannotGC nogc;
+            if (input->hasLatin1Chars()) {
+                const Latin1Char *chars = input->latin1Chars(nogc) + charsOffset;
+                result = irregexp::ExecuteCode(cx, jitCodeLatin1, chars, start, length, &matches);
+            } else {
+                const jschar *chars = input->twoByteChars(nogc) + charsOffset;
+                result = irregexp::ExecuteCode(cx, jitCodeTwoByte, chars, start, length, &matches);
+            }
         }
 
         if (result == RegExpRunStatus_Error) {
             // The RegExp engine might exit with an exception if an interrupt
             // was requested. Check this case and retry until a clean result is
             // obtained.
             bool interrupted;
             {
@@ -880,20 +899,20 @@ js::ParseRegExpFlags(JSContext *cx, JSSt
     if (!linear)
         return false;
 
     size_t len = linear->length();
 
     bool ok;
     jschar lastParsed;
     if (linear->hasLatin1Chars()) {
-        JS::AutoCheckCannotGC nogc;
+        AutoCheckCannotGC nogc;
         ok = ::ParseRegExpFlags(linear->latin1Chars(nogc), len, flagsOut, &lastParsed);
     } else {
-        JS::AutoCheckCannotGC nogc;
+        AutoCheckCannotGC nogc;
         ok = ::ParseRegExpFlags(linear->twoByteChars(nogc), len, flagsOut, &lastParsed);
     }
 
     if (!ok) {
         char charBuf[2];
         charBuf[0] = char(lastParsed);
         charBuf[1] = '\0';
         JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,