From b8c8d932f2d6a40f07ac867960bf7952c885c6bc Mon Sep 17 00:00:00 2001 From: Fedor Date: Wed, 25 Dec 2019 15:45:50 +0300 Subject: [PATCH] Implement regex lookaround. --- js/src/builtin/TestingFunctions.cpp | 6 +- .../irregexp/NativeRegExpMacroAssembler.cpp | 8 +- js/src/irregexp/NativeRegExpMacroAssembler.h | 7 +- js/src/irregexp/RegExpAST.cpp | 8 +- js/src/irregexp/RegExpAST.h | 33 ++-- js/src/irregexp/RegExpBytecode.h | 23 +-- js/src/irregexp/RegExpEngine.cpp | 149 +++++++++++------- js/src/irregexp/RegExpEngine.h | 31 +++- js/src/irregexp/RegExpInterpreter.cpp | 74 ++++++++- js/src/irregexp/RegExpMacroAssembler.cpp | 17 +- js/src/irregexp/RegExpMacroAssembler.h | 15 +- js/src/irregexp/RegExpParser.cpp | 130 ++++++++++----- js/src/irregexp/RegExpParser.h | 19 ++- 13 files changed, 361 insertions(+), 159 deletions(-) diff --git a/js/src/builtin/TestingFunctions.cpp b/js/src/builtin/TestingFunctions.cpp index 4363c7aed..072a72b75 100644 --- a/js/src/builtin/TestingFunctions.cpp +++ b/js/src/builtin/TestingFunctions.cpp @@ -3900,10 +3900,10 @@ ConvertRegExpTreeToObject(JSContext* cx, irregexp::RegExpTree* tree) return nullptr; return obj; } - if (tree->IsLookahead()) { - if (!StringProp(cx, obj, "type", "Lookahead")) + if (tree->IsLookaround()) { + if (!StringProp(cx, obj, "type", "Lookaround")) return nullptr; - irregexp::RegExpLookahead* t = tree->AsLookahead(); + irregexp::RegExpLookaround* t = tree->AsLookaround(); if (!BooleanProp(cx, obj, "is_positive", t->is_positive())) return nullptr; if (!TreeProp(cx, obj, "body", t->body())) diff --git a/js/src/irregexp/NativeRegExpMacroAssembler.cpp b/js/src/irregexp/NativeRegExpMacroAssembler.cpp index 0fb507297..e17eecb9b 100644 --- a/js/src/irregexp/NativeRegExpMacroAssembler.cpp +++ b/js/src/irregexp/NativeRegExpMacroAssembler.cpp @@ -582,7 +582,7 @@ NativeRegExpMacroAssembler::CheckAtStart(Label* on_at_start) } void -NativeRegExpMacroAssembler::CheckNotAtStart(Label* on_not_at_start) 
+NativeRegExpMacroAssembler::CheckNotAtStart(int cp_offset, Label* on_not_at_start) { JitSpew(SPEW_PREFIX "CheckNotAtStart"); @@ -673,7 +673,7 @@ NativeRegExpMacroAssembler::CheckGreedyLoop(Label* on_tos_equals_current_positio } void -NativeRegExpMacroAssembler::CheckNotBackReference(int start_reg, Label* on_no_match) +NativeRegExpMacroAssembler::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { JitSpew(SPEW_PREFIX "CheckNotBackReference(%d)", start_reg); @@ -744,8 +744,8 @@ NativeRegExpMacroAssembler::CheckNotBackReference(int start_reg, Label* on_no_ma } void -NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, Label* on_no_match, - bool unicode) +NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + Label* on_no_match, bool unicode) { JitSpew(SPEW_PREFIX "CheckNotBackReferenceIgnoreCase(%d, %d)", start_reg, unicode); diff --git a/js/src/irregexp/NativeRegExpMacroAssembler.h b/js/src/irregexp/NativeRegExpMacroAssembler.h index 7a72e252f..fc582dccf 100644 --- a/js/src/irregexp/NativeRegExpMacroAssembler.h +++ b/js/src/irregexp/NativeRegExpMacroAssembler.h @@ -105,9 +105,10 @@ class MOZ_STACK_CLASS NativeRegExpMacroAssembler final : public RegExpMacroAssem void CheckCharacterGT(char16_t limit, jit::Label* on_greater); void CheckCharacterLT(char16_t limit, jit::Label* on_less); void CheckGreedyLoop(jit::Label* on_tos_equals_current_position); - void CheckNotAtStart(jit::Label* on_not_at_start); - void CheckNotBackReference(int start_reg, jit::Label* on_no_match); - void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match, bool unicode); + void CheckNotAtStart(int cp_offset, jit::Label* on_not_at_start); + void CheckNotBackReference(int start_reg, bool read_backward, jit::Label* on_no_match); + void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + jit::Label* on_no_match, bool unicode); void CheckNotCharacter(unsigned c, jit::Label* 
on_not_equal); void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, jit::Label* on_not_equal); void CheckNotCharacterAfterMinusAnd(char16_t c, char16_t minus, char16_t and_with, diff --git a/js/src/irregexp/RegExpAST.cpp b/js/src/irregexp/RegExpAST.cpp index 8dfd99057..43867c312 100644 --- a/js/src/irregexp/RegExpAST.cpp +++ b/js/src/irregexp/RegExpAST.cpp @@ -250,16 +250,16 @@ RegExpCapture::CaptureRegisters() } // ---------------------------------------------------------------------------- -// RegExpLookahead +// RegExpLookaround Interval -RegExpLookahead::CaptureRegisters() +RegExpLookaround::CaptureRegisters() { return body()->CaptureRegisters(); } bool -RegExpLookahead::IsAnchoredAtStart() +RegExpLookaround::IsAnchoredAtStart() { - return is_positive() && body()->IsAnchoredAtStart(); + return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart(); } diff --git a/js/src/irregexp/RegExpAST.h b/js/src/irregexp/RegExpAST.h index 7bda6fc7e..6f59842bc 100644 --- a/js/src/irregexp/RegExpAST.h +++ b/js/src/irregexp/RegExpAST.h @@ -360,6 +360,7 @@ class RegExpCapture : public RegExpTree virtual int min_match() { return body_->min_match(); } virtual int max_match() { return body_->max_match(); } RegExpTree* body() { return body_; } + void set_body(RegExpTree* body) { body_ = body; } int index() { return index_; } static int StartRegister(int index) { return index * 2; } static int EndRegister(int index) { return index * 2 + 1; } @@ -369,25 +370,29 @@ class RegExpCapture : public RegExpTree int index_; }; -class RegExpLookahead : public RegExpTree +class RegExpLookaround : public RegExpTree { public: - RegExpLookahead(RegExpTree* body, - bool is_positive, - int capture_count, - int capture_from) + enum Type { LOOKAHEAD, LOOKBEHIND }; + + RegExpLookaround(RegExpTree* body, + bool is_positive, + int capture_count, + int capture_from, + Type type) : body_(body), is_positive_(is_positive), capture_count_(capture_count), - capture_from_(capture_from) + 
capture_from_(capture_from), + type_(type) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); - virtual RegExpLookahead* AsLookahead(); + virtual RegExpLookaround* AsLookaround(); virtual Interval CaptureRegisters(); - virtual bool IsLookahead(); + virtual bool IsLookaround(); virtual bool IsAnchoredAtStart(); virtual int min_match() { return 0; } virtual int max_match() { return 0; } @@ -395,12 +400,14 @@ class RegExpLookahead : public RegExpTree bool is_positive() { return is_positive_; } int capture_count() { return capture_count_; } int capture_from() { return capture_from_; } + Type type() { return type_; } private: RegExpTree* body_; bool is_positive_; int capture_count_; int capture_from_; + Type type_; }; typedef InfallibleVector RegExpCaptureVector; @@ -417,8 +424,14 @@ class RegExpBackReference : public RegExpTree RegExpNode* on_success); virtual RegExpBackReference* AsBackReference(); virtual bool IsBackReference(); - virtual int min_match() { return 0; } - virtual int max_match() { return capture_->max_match(); } + virtual int min_match() override { return 0; } + // The capture may not be completely parsed yet, if the reference occurs + // before the capture. In the ordinary case, nothing has been captured yet, + // so the back reference must have the length 0. If the back reference is + // inside a lookbehind, effectively making it a forward reference, we return 0 as well, since a lookbehind has a length of 0. + virtual int max_match() override { + return capture_->body() ? 
capture_->max_match() : 0; + } int index() { return capture_->index(); } RegExpCapture* capture() { return capture_; } private: diff --git a/js/src/irregexp/RegExpBytecode.h b/js/src/irregexp/RegExpBytecode.h index f31b78c59..ea3f80b4f 100644 --- a/js/src/irregexp/RegExpBytecode.h +++ b/js/src/irregexp/RegExpBytecode.h @@ -82,16 +82,19 @@ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_REGS_EQUAL, 39, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ -V(CHECK_REGISTER_LT, 40, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_GE, 41, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_EQ_POS, 42, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_AT_START, 43, 8) /* bc8 pad24 addr32 */ \ -V(CHECK_NOT_AT_START, 44, 8) /* bc8 pad24 addr32 */ \ -V(CHECK_GREEDY, 45, 8) /* bc8 pad24 addr32 */ \ -V(ADVANCE_CP_AND_GOTO, 46, 8) /* bc8 offset24 addr32 */ \ -V(SET_CURRENT_POSITION_FROM_END, 47, 4) /* bc8 idx24 */ \ -V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 48, 8) /* bc8 reg_idx24 addr32 */ +V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ +V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ +V(CHECK_NOT_AT_START, 46, 8) /* bc8 pad24 addr32 */ \ +V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \ +V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \ +V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 50, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_UNICODE, 51, 8) /* bc8 
reg_idx24 addr32 */ #define DECLARE_BYTECODES(name, code, length) \ static const int BC_##name = code; diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 4d691a5dc..62f94c3e7 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -721,6 +721,8 @@ ActionNode::EmptyMatchCheck(int start_register, int TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) { + if (read_backward()) + return 0; int answer = Length(); if (answer >= still_to_find) return answer; @@ -736,8 +738,7 @@ TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) int TextNode::GreedyLoopTextLength() { - TextElement elm = elements()[elements().length() - 1]; - return elm.cp_offset() + elm.length(); + return Length(); } RegExpNode* @@ -887,6 +888,8 @@ AssertionNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, boo int BackReferenceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) { + if (read_backward()) + return 0; if (budget <= 0) return 0; return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start); @@ -1578,6 +1581,9 @@ class irregexp::RegExpCompiler current_expansion_factor_ = value; } + bool read_backward() { return read_backward_; } + void set_read_backward(bool value) { read_backward_ = value; } + JSContext* cx() const { return cx_; } LifoAlloc* alloc() const { return alloc_; } @@ -1595,6 +1601,7 @@ class irregexp::RegExpCompiler bool unicode_; bool reg_exp_too_big_; int current_expansion_factor_; + bool read_backward_; FrequencyCollator frequency_collator_; JSContext* cx_; LifoAlloc* alloc_; @@ -1624,6 +1631,7 @@ RegExpCompiler::RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_coun unicode_(unicode), reg_exp_too_big_(false), current_expansion_factor_(1), + read_backward_(false), frequency_collator_(), cx_(cx), alloc_(alloc) @@ -1747,7 +1755,7 @@ irregexp::CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData* // at the 
start of input. ChoiceNode* first_step_node = alloc.newInfallible(&alloc, 2); RegExpNode* char_class = - alloc.newInfallible(alloc.newInfallible('*'), loop_node); + alloc.newInfallible(alloc.newInfallible('*'), false, loop_node); first_step_node->AddAlternative(GuardedAlternative(captured_body)); first_step_node->AddAlternative(GuardedAlternative(char_class)); node = first_step_node; @@ -1850,19 +1858,19 @@ RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) TextElementVector* elms = compiler->alloc()->newInfallible(*compiler->alloc()); elms->append(TextElement::Atom(this)); - return compiler->alloc()->newInfallible(elms, on_success); + return compiler->alloc()->newInfallible(elms, compiler->read_backward(), on_success); } RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return compiler->alloc()->newInfallible(&elements_, on_success); + return compiler->alloc()->newInfallible(&elements_, compiler->read_backward(), on_success); } RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return compiler->alloc()->newInfallible(this, on_success); + return compiler->alloc()->newInfallible(this, compiler->read_backward(), on_success); } RegExpNode* @@ -2003,7 +2011,8 @@ RegExpQuantifier::ToNode(int min, alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer))); } answer = alternation; - if (not_at_start) alternation->set_not_at_start(); + if (not_at_start && !compiler->read_backward()) + alternation->set_not_at_start(); } return answer; } @@ -2015,8 +2024,9 @@ RegExpQuantifier::ToNode(int min, int reg_ctr = needs_counter ? 
compiler->AllocateRegister() : RegExpCompiler::kNoRegister; - LoopChoiceNode* center = alloc->newInfallible(alloc, body->min_match() == 0); - if (not_at_start) + LoopChoiceNode* center = alloc->newInfallible(alloc, body->min_match() == 0, + compiler->read_backward()); + if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); RegExpNode* loop_return = needs_counter ? static_cast(ActionNode::IncrementRegister(reg_ctr, center)) @@ -2092,7 +2102,7 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler, CharacterRange::AddClassEscape(alloc, 'n', newline_ranges); RegExpCharacterClass* newline_atom = alloc->newInfallible('n'); TextNode* newline_matcher = - alloc->newInfallible(newline_atom, + alloc->newInfallible(newline_atom, false, ActionNode::PositiveSubmatchSuccess(stack_pointer_register, position_register, 0, // No captures inside. @@ -2124,6 +2134,7 @@ RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return compiler->alloc()->newInfallible(RegExpCapture::StartRegister(index()), RegExpCapture::EndRegister(index()), + compiler->read_backward(), on_success); } @@ -2134,7 +2145,7 @@ RegExpEmpty::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) } RegExpNode* -RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) +RegExpLookaround::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { int stack_pointer_register = compiler->AllocateRegister(); int position_register = compiler->AllocateRegister(); @@ -2145,6 +2156,10 @@ RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) int register_start = register_of_first_capture + capture_from_ * registers_per_capture; + RegExpNode* result; + bool was_reading_backward = compiler->read_backward(); + compiler->set_read_backward(type() == LOOKBEHIND); + if (is_positive()) { RegExpNode* bodyNode = body()->ToNode(compiler, @@ -2153,37 +2168,39 @@ RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) register_count, 
register_start, on_success)); - return ActionNode::BeginSubmatch(stack_pointer_register, + result = ActionNode::BeginSubmatch(stack_pointer_register, + position_register, + bodyNode); + } else { + // We use a ChoiceNode for a negative lookaround because it has most of + // the characteristics we need. It has the body of the lookaround as its + // first alternative and the expression after the lookaround as its second + // alternative. If the first alternative succeeds then the + // NegativeSubmatchSuccess will unwind the stack including everything the + // choice node set up and backtrack. If the first alternative fails then + // the second alternative is tried, which is exactly the desired result + // for a negative lookaround. The NegativeLookaheadChoiceNode is a special + // ChoiceNode that knows to ignore the first exit when calculating quick + // checks. + LifoAlloc* alloc = compiler->alloc(); + + RegExpNode* success = + alloc->newInfallible(alloc, + stack_pointer_register, + position_register, + register_count, + register_start); + GuardedAlternative body_alt(body()->ToNode(compiler, success)); + + ChoiceNode* choice_node = + alloc->newInfallible(alloc, body_alt, GuardedAlternative(on_success)); + + result = ActionNode::BeginSubmatch(stack_pointer_register, position_register, - bodyNode); + choice_node); } - - // We use a ChoiceNode for a negative lookahead because it has most of - // the characteristics we need. It has the body of the lookahead as its - // first alternative and the expression after the lookahead of the second - // alternative. If the first alternative succeeds then the - // NegativeSubmatchSuccess will unwind the stack including everything the - // choice node set up and backtrack. If the first alternative fails then - // the second alternative is tried, which is exactly the desired result - // for a negative lookahead. 
The NegativeLookaheadChoiceNode is a special - // ChoiceNode that knows to ignore the first exit when calculating quick - // checks. - LifoAlloc* alloc = compiler->alloc(); - - RegExpNode* success = - alloc->newInfallible(alloc, - stack_pointer_register, - position_register, - register_count, - register_start); - GuardedAlternative body_alt(body()->ToNode(compiler, success)); - - ChoiceNode* choice_node = - alloc->newInfallible(alloc, body_alt, GuardedAlternative(on_success)); - - return ActionNode::BeginSubmatch(stack_pointer_register, - position_register, - choice_node); + compiler->set_read_backward(was_reading_backward); + return result; } RegExpNode* @@ -2198,8 +2215,14 @@ RegExpCapture::ToNode(RegExpTree* body, RegExpCompiler* compiler, RegExpNode* on_success) { + MOZ_ASSERT(body); int start_reg = RegExpCapture::StartRegister(index); int end_reg = RegExpCapture::EndRegister(index); + if (compiler->read_backward()) { + // std::swap(start_reg, end_reg); + start_reg = RegExpCapture::EndRegister(index); + end_reg = RegExpCapture::StartRegister(index); + } RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); RegExpNode* body_node = body->ToNode(compiler, store_end); return ActionNode::StorePosition(start_reg, true, body_node); @@ -2210,8 +2233,15 @@ RegExpAlternative::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { const RegExpTreeVector& children = nodes(); RegExpNode* current = on_success; - for (int i = children.length() - 1; i >= 0; i--) - current = children[i]->ToNode(compiler, current); + if (compiler->read_backward()) { + for (int i = 0; i < children.length(); i++) { + current = children[i]->ToNode(compiler, current); + } + } else { + for (int i = children.length() - 1; i >= 0; i--) { + current = children[i]->ToNode(compiler, current); + } + } return current; } @@ -2764,7 +2794,6 @@ Trace::InvalidateCurrentCharacter() void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { - MOZ_ASSERT(by > 0); // We 
don't have an instruction for shifting the current character register // down or for using a shifted value for anything so lets just forget that // we preloaded any characters into it. @@ -3109,9 +3138,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) return; } if (trace->at_start() == Trace::UNKNOWN) { - assembler->CheckNotAtStart(trace->backtrack()); + assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); Trace at_start_trace = *trace; - at_start_trace.set_at_start(true); + at_start_trace.set_at_start(Trace::TRUE_VALUE); on_success()->Emit(compiler, &at_start_trace); return; } @@ -3814,9 +3843,10 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, jit::Label* backtrack = trace->backtrack(); QuickCheckDetails* quick_check = trace->quick_check_performed(); int element_count = elements().length(); + int backward_offset = read_backward() ? -Length() : 0; for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { TextElement elm = elements()[i]; - int cp_offset = trace->cp_offset() + elm.cp_offset(); + int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; if (elm.text_type() == TextElement::ATOM) { const CharacterVector& quarks = elm.atom()->data(); for (int j = preloaded ? 
0 : quarks.length() - 1; j >= 0; j--) { @@ -3844,11 +3874,12 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, break; } if (emit_function != nullptr) { + bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); bool bound_checked = emit_function(compiler, quarks[j], backtrack, cp_offset + j, - *checked_up_to < cp_offset + j, + bounds_check, preloaded); if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } @@ -3859,13 +3890,14 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, if (first_element_checked && i == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset())) continue; RegExpCharacterClass* cc = elm.char_class(); + bool bounds_check = *checked_up_to < cp_offset || read_backward(); EmitCharClass(alloc(), assembler, cc, ascii, backtrack, cp_offset, - *checked_up_to < cp_offset, + bounds_check, preloaded); UpdateBoundsCheck(cp_offset, checked_up_to); } @@ -3945,8 +3977,11 @@ TextNode::Emit(RegExpCompiler* compiler, Trace* trace) } Trace successor_trace(*trace); - successor_trace.set_at_start(false); - successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); + // If we advance backward, we may end up at the start. + successor_trace.AdvanceCurrentPositionInTrace( + read_backward() ? -Length() : Length(), compiler); + successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN + : Trace::FALSE_VALUE); RecursionCheck rc(compiler); on_success()->Emit(compiler, &successor_trace); } @@ -4118,6 +4153,8 @@ ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_lea RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(RegExpCompiler* compiler) { + if (read_backward()) return NULL; + if (elements().length() != 1) return nullptr; @@ -4165,7 +4202,7 @@ ChoiceNode::GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative) SeqRegExpNode* seq_node = static_cast(node); node = seq_node->on_success(); } - return length; + return read_backward() ? 
-length : length; } // Creates a list of AlternativeGenerations. If the list has a reasonable @@ -4240,7 +4277,7 @@ ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) jit::Label greedy_loop_label; Trace counter_backtrack_trace; counter_backtrack_trace.set_backtrack(&greedy_loop_label); - if (not_at_start()) counter_backtrack_trace.set_at_start(false); + if (not_at_start()) counter_backtrack_trace.set_at_start(Trace::FALSE_VALUE); if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { // Here we have special handling for greedy loops containing only text nodes @@ -4256,7 +4293,7 @@ ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) current_trace = &counter_backtrack_trace; jit::Label greedy_match_failed; Trace greedy_match_trace; - if (not_at_start()) greedy_match_trace.set_at_start(false); + if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); greedy_match_trace.set_backtrack(&greedy_match_failed); jit::Label loop_label; macro_assembler->Bind(&loop_label); @@ -4605,11 +4642,14 @@ BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) MOZ_ASSERT(start_reg_ + 1 == end_reg_); if (compiler->ignore_case()) { assembler->CheckNotBackReferenceIgnoreCase(start_reg_, + read_backward(), trace->backtrack(), compiler->unicode()); } else { - assembler->CheckNotBackReference(start_reg_, trace->backtrack()); + assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); } + // We are going to advance backward, so we may end up at the start. 
+ if (read_backward()) trace->set_at_start(Trace::UNKNOWN); on_success()->Emit(compiler, trace); } @@ -4977,7 +5017,6 @@ QuickCheckDetails::Clear() void QuickCheckDetails::Advance(int by, bool ascii) { - MOZ_ASSERT(by >= 0); if (by >= characters_) { Clear(); return; diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h index 1a8fd4b22..c4409dcca 100644 --- a/js/src/irregexp/RegExpEngine.h +++ b/js/src/irregexp/RegExpEngine.h @@ -119,7 +119,7 @@ InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* chars, size_t VISIT(Atom) \ VISIT(Quantifier) \ VISIT(Capture) \ - VISIT(Lookahead) \ + VISIT(Lookaround) \ VISIT(BackReference) \ VISIT(Empty) \ VISIT(Text) @@ -763,15 +763,19 @@ class TextNode : public SeqRegExpNode { public: TextNode(TextElementVector* elements, + bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), - elements_(elements) + elements_(elements), + read_backward_(read_backward) {} TextNode(RegExpCharacterClass* that, + bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), - elements_(alloc()->newInfallible(*alloc())) + elements_(alloc()->newInfallible(*alloc())), + read_backward_(read_backward) { elements_->append(TextElement::CharClass(that)); } @@ -784,6 +788,7 @@ class TextNode : public SeqRegExpNode int characters_filled_in, bool not_at_start); TextElementVector& elements() { return *elements_; } + bool read_backward() { return read_backward_; } void MakeCaseIndependent(bool is_ascii, bool unicode); virtual int GreedyLoopTextLength(); virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( @@ -814,6 +819,7 @@ class TextNode : public SeqRegExpNode int* checked_up_to); int Length(); TextElementVector* elements_; + bool read_backward_; }; class AssertionNode : public SeqRegExpNode @@ -882,15 +888,18 @@ class BackReferenceNode : public SeqRegExpNode public: BackReferenceNode(int start_reg, int end_reg, + bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), 
start_reg_(start_reg), - end_reg_(end_reg) + end_reg_(end_reg), + read_backward_(read_backward) {} virtual void Accept(NodeVisitor* visitor); int start_register() { return start_reg_; } int end_register() { return end_reg_; } + bool read_backward() { return read_backward_; } virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, @@ -909,6 +918,7 @@ class BackReferenceNode : public SeqRegExpNode private: int start_reg_; int end_reg_; + bool read_backward_; }; class EndNode : public RegExpNode @@ -1053,6 +1063,7 @@ class ChoiceNode : public RegExpNode void set_being_calculated(bool b) { being_calculated_ = b; } virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; } virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode); + virtual bool read_backward() { return false; } protected: int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); @@ -1111,11 +1122,13 @@ class NegativeLookaheadChoiceNode : public ChoiceNode class LoopChoiceNode : public ChoiceNode { public: - explicit LoopChoiceNode(LifoAlloc* alloc, bool body_can_be_zero_length) + explicit LoopChoiceNode(LifoAlloc* alloc, bool body_can_be_zero_length, + bool read_backward) : ChoiceNode(alloc, 2), loop_node_(nullptr), continue_node_(nullptr), - body_can_be_zero_length_(body_can_be_zero_length) + body_can_be_zero_length_(body_can_be_zero_length), + read_backward_(read_backward) {} void AddLoopAlternative(GuardedAlternative alt); @@ -1133,6 +1146,7 @@ class LoopChoiceNode : public ChoiceNode RegExpNode* loop_node() { return loop_node_; } RegExpNode* continue_node() { return continue_node_; } bool body_can_be_zero_length() { return body_can_be_zero_length_; } + virtual bool read_backward() { return read_backward_; } virtual void Accept(NodeVisitor* visitor); virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode); @@ -1147,6 +1161,7 @@ class LoopChoiceNode : public 
ChoiceNode RegExpNode* loop_node_; RegExpNode* continue_node_; bool body_can_be_zero_length_; + bool read_backward_; }; // Improve the speed that we scan for an initial point where a non-anchored @@ -1422,8 +1437,8 @@ class Trace } TriBool at_start() { return at_start_; } - void set_at_start(bool at_start) { - at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE; + void set_at_start(TriBool at_start) { + at_start_ = at_start; } jit::Label* backtrack() { return backtrack_; } jit::Label* loop_label() { return loop_label_; } diff --git a/js/src/irregexp/RegExpInterpreter.cpp b/js/src/irregexp/RegExpInterpreter.cpp index 7fd2d983a..d09b4671e 100644 --- a/js/src/irregexp/RegExpInterpreter.cpp +++ b/js/src/irregexp/RegExpInterpreter.cpp @@ -222,8 +222,8 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha } break; BYTECODE(LOAD_CURRENT_CHAR) { - size_t pos = current + (insn >> BYTECODE_SHIFT); - if (pos >= length) { + int pos = current + (insn >> BYTECODE_SHIFT); + if (pos >= (int)length || pos < 0) { pc = byteCode + Load32Aligned(pc + 4); } else { current_char = chars[pos]; @@ -238,8 +238,8 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha break; } BYTECODE(LOAD_2_CURRENT_CHARS) { - size_t pos = current + (insn >> BYTECODE_SHIFT); - if (pos + 2 > length) { + int pos = current + (insn >> BYTECODE_SHIFT); + if (pos + 2 > (int)length || pos < 0) { pc = byteCode + Load32Aligned(pc + 4); } else { CharT next = chars[pos + 1]; @@ -425,6 +425,30 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha pc += BC_CHECK_NOT_BACK_REF_LENGTH; break; } + BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) { + int from = registers[insn >> BYTECODE_SHIFT]; + int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + if (from < 0 || len <= 0) { + pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; + break; + } + if (int(current) - len < 0) { + pc = byteCode + Load32Aligned(pc + 4); + break; + } else { + int i; + for (i = 
0; i < len; i++) { + if (chars[from + i] != chars[int(current) - len + i]) { + pc = byteCode + Load32Aligned(pc + 4); + break; + } + } + if (i < len) break; + current -= len; + } + pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; + break; + } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; @@ -465,6 +489,46 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha } break; } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { + int from = registers[insn >> BYTECODE_SHIFT]; + int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + if (from < 0 || len <= 0) { + pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; + break; + } + if (int(current) - len < 0) { + pc = byteCode + Load32Aligned(pc + 4); + break; + } + if (CaseInsensitiveCompareStrings(chars + from, chars + int(current) - len, len * sizeof(CharT))) { + current -= len; + pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; + } else { + pc = byteCode + Load32Aligned(pc + 4); + } + break; + + } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_UNICODE) { + int from = registers[insn >> BYTECODE_SHIFT]; + int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + if (from < 0 || len <= 0) { + pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; + break; + } + if (int(current) - len < 0) { + pc = byteCode + Load32Aligned(pc + 4); + break; + } + if (CaseInsensitiveCompareUCStrings(chars + from, chars + int(current) - len, len * sizeof(CharT))) { + current -= len; + pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; + } else { + pc = byteCode + Load32Aligned(pc + 4); + } + break; + + } BYTECODE(CHECK_AT_START) if (current == 0) pc = byteCode + Load32Aligned(pc + 4); @@ -472,7 +536,7 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha pc += BC_CHECK_AT_START_LENGTH; break; BYTECODE(CHECK_NOT_AT_START) - if (current == 0) + if (current + (insn >> BYTECODE_SHIFT) == 0) 
pc += BC_CHECK_NOT_AT_START_LENGTH; else pc = byteCode + Load32Aligned(pc + 4); diff --git a/js/src/irregexp/RegExpMacroAssembler.cpp b/js/src/irregexp/RegExpMacroAssembler.cpp index d66d0d204..6b1ceba8a 100644 --- a/js/src/irregexp/RegExpMacroAssembler.cpp +++ b/js/src/irregexp/RegExpMacroAssembler.cpp @@ -226,32 +226,37 @@ InterpretedRegExpMacroAssembler::CheckGreedyLoop(jit::Label* on_tos_equals_curre } void -InterpretedRegExpMacroAssembler::CheckNotAtStart(jit::Label* on_not_at_start) +InterpretedRegExpMacroAssembler::CheckNotAtStart(int cp_offset, jit::Label* on_not_at_start) { - Emit(BC_CHECK_NOT_AT_START, 0); + Emit(BC_CHECK_NOT_AT_START, cp_offset); /* cp_offset replaces the previous constant 0 as the instruction argument */ EmitOrLink(on_not_at_start); } void -InterpretedRegExpMacroAssembler::CheckNotBackReference(int start_reg, jit::Label* on_no_match) +InterpretedRegExpMacroAssembler::CheckNotBackReference(int start_reg, bool read_backward, + jit::Label* on_no_match) { MOZ_ASSERT(start_reg >= 0); MOZ_ASSERT(start_reg <= kMaxRegister); - Emit(BC_CHECK_NOT_BACK_REF, start_reg); + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF, + start_reg); /* read_backward only selects the opcode; start_reg is the argument in both cases */ EmitOrLink(on_no_match); } void InterpretedRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, + bool read_backward, jit::Label* on_no_match, bool unicode) { MOZ_ASSERT(start_reg >= 0); MOZ_ASSERT(start_reg <= kMaxRegister); if (unicode) - Emit(BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE, start_reg); + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_UNICODE : BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE, + start_reg); /* unicode and read_backward together choose one of four opcodes */ else - Emit(BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg); + Emit(read_backward ? 
BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD : BC_CHECK_NOT_BACK_REF_NO_CASE, + start_reg); EmitOrLink(on_no_match); } diff --git a/js/src/irregexp/RegExpMacroAssembler.h b/js/src/irregexp/RegExpMacroAssembler.h index dca2edf90..c5def92f2 100644 --- a/js/src/irregexp/RegExpMacroAssembler.h +++ b/js/src/irregexp/RegExpMacroAssembler.h @@ -110,10 +110,10 @@ class MOZ_STACK_CLASS RegExpMacroAssembler virtual void CheckCharacterGT(char16_t limit, jit::Label* on_greater) = 0; virtual void CheckCharacterLT(char16_t limit, jit::Label* on_less) = 0; virtual void CheckGreedyLoop(jit::Label* on_tos_equals_current_position) = 0; - virtual void CheckNotAtStart(jit::Label* on_not_at_start) = 0; - virtual void CheckNotBackReference(int start_reg, jit::Label* on_no_match) = 0; - virtual void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match, - bool unicode) = 0; + virtual void CheckNotAtStart(int cp_offset, jit::Label* on_not_at_start) = 0; + virtual void CheckNotBackReference(int start_reg, bool read_backward, jit::Label* on_no_match) = 0; /* in InterpretedRegExpMacroAssembler, read_backward selects the _BACKWARD bytecode */ + virtual void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + jit::Label* on_no_match, bool unicode) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. 
End of input always @@ -245,9 +245,10 @@ class MOZ_STACK_CLASS InterpretedRegExpMacroAssembler final : public RegExpMacro void CheckCharacterGT(char16_t limit, jit::Label* on_greater); void CheckCharacterLT(char16_t limit, jit::Label* on_less); void CheckGreedyLoop(jit::Label* on_tos_equals_current_position); - void CheckNotAtStart(jit::Label* on_not_at_start); - void CheckNotBackReference(int start_reg, jit::Label* on_no_match); - void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match, bool unicode); + void CheckNotAtStart(int cp_offset, jit::Label* on_not_at_start); + void CheckNotBackReference(int start_reg, bool read_backward, jit::Label* on_no_match); + void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + jit::Label* on_no_match, bool unicode); void CheckNotCharacter(unsigned c, jit::Label* on_not_equal); void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, jit::Label* on_not_equal); void CheckNotCharacterAfterMinusAnd(char16_t c, char16_t minus, char16_t and_with, diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 8bd88047a..9ef9fe3e2 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -227,6 +227,7 @@ RegExpParser::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, alloc(alloc), captures_(nullptr), next_pos_(chars), end_(end), current_(kEndMarker), + captures_started_(0), /* listed after current_ to match the member declaration order in RegExpParser.h */ capture_count_(0), @@ -418,7 +419,8 @@ RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) static inline RegExpTree* NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) { - return alloc->newInfallible(RangeAtom(alloc, from, to), false, 0, 0); + return alloc->newInfallible(RangeAtom(alloc, from, to), false, + 0, 0, RegExpLookaround::LOOKAHEAD); } static bool @@ -1213,6 +1215,38 @@ RegExpParser::ParseBackReferenceIndex(int* index_out) return true; } +template +RegExpCapture* +RegExpParser::GetCapture(int index) { /* Returns the capture for the one-based group index, first growing captures_ with placeholder captures up to the number of known captures. */ + // Capture group indices are one-based; positions in captures_ are 
+ // zero-based, hence the index - 1 below. + int known_captures = + is_scanned_for_captures_ ? capture_count_ : captures_started_; + MOZ_ASSERT(index <= known_captures); + if (captures_ == nullptr) { + captures_ = alloc->newInfallible(*alloc); + } + while ((int)captures_->length() < known_captures) { + RegExpCapture* capture = alloc->newInfallible(nullptr, captures_->length() + 1); /* placeholder created with nullptr as its first argument and the next one-based index */ + captures_->append(capture); + } + return (*captures_)[index - 1]; +} + + +template +bool +RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { /* Walks this state and its enclosing states via previous_state(), looking for a CAPTURE state whose capture_index() equals index. */ + for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + // Return true if we found the matching capture index. + if (index == s->capture_index()) return true; + // Abort if index is larger than what has been parsed up till this state. + if (index > s->capture_index()) return false; + } + return false; +} + // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1423,24 +1457,24 @@ RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(alloc, nullptr, INITIAL, 0); - RegExpParserState* stored_state = &initial_state; + RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0); + RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); while (true) { switch (current()) { case kEndMarker: - if (stored_state->IsSubexpression()) { + if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. return ReportError(JSMSG_MISSING_PAREN); } - MOZ_ASSERT(INITIAL == stored_state->group_type()); + MOZ_ASSERT(INITIAL == state->group_type()); // Parsing completed successfully. 
return builder->ToRegExp(); case ')': { - if (!stored_state->IsSubexpression()) + if (!state->IsSubexpression()) return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN); - MOZ_ASSERT(INITIAL != stored_state->group_type()); + MOZ_ASSERT(INITIAL != state->group_type()); Advance(); // End disjunction parsing and convert builder content to new single @@ -1449,29 +1483,30 @@ RegExpParser::ParseDisjunction() int end_capture_index = captures_started(); - int capture_index = stored_state->capture_index(); - SubexpressionType group_type = stored_state->group_type(); - - // Restore previous state. - stored_state = stored_state->previous_state(); - builder = stored_state->builder(); + int capture_index = state->capture_index(); + SubexpressionType group_type = state->group_type(); // Build result of subexpression. if (group_type == CAPTURE) { - RegExpCapture* capture = alloc->newInfallible(body, capture_index); - (*captures_)[capture_index - 1] = capture; + RegExpCapture* capture = GetCapture(capture_index); + capture->set_body(body); /* the capture object is obtained from GetCapture, so only its body is set here */ body = capture; } else if (group_type != GROUPING) { - MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD || - group_type == NEGATIVE_LOOKAHEAD); - bool is_positive = (group_type == POSITIVE_LOOKAHEAD); - body = alloc->newInfallible(body, + MOZ_ASSERT(group_type == POSITIVE_LOOKAROUND || + group_type == NEGATIVE_LOOKAROUND); + bool is_positive = (group_type == POSITIVE_LOOKAROUND); + body = alloc->newInfallible(body, is_positive, end_capture_index - capture_index, - capture_index); + capture_index, + state->lookaround_type()); } + + // Restore previous state; done after the lookaround is built because state->lookaround_type() is read above. + state = state->previous_state(); + builder = state->builder(); builder->AddAtom(body); - if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) + if (unicode_ && (group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND)) continue; // For compatability with JSC and ES3, we allow quantifiers after // lookaheads, and break in all cases. 
@@ -1519,6 +1554,7 @@ RegExpParser::ParseDisjunction() } case '(': { SubexpressionType subexpr_type = CAPTURE; + RegExpLookaround::Type lookaround_type = state->lookaround_type(); /* start from the enclosing state's direction; the '=' and '!' cases reset it to LOOKAHEAD and the '<' case sets LOOKBEHIND */ Advance(); if (current() == '?') { switch (Next()) { @@ -1526,26 +1562,39 @@ RegExpParser::ParseDisjunction() subexpr_type = GROUPING; break; case '=': - subexpr_type = POSITIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; break; case '!': - subexpr_type = NEGATIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; break; + case '<': + Advance(); /* NOTE(review): presumably steps onto '<' so that Next() below is the character after it - confirm against Advance() */ + lookaround_type = RegExpLookaround::LOOKBEHIND; + if (Next() == '=') { + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + // Neither '=' nor '!' follows '<', so this is not a lookbehind. + // Report it as an invalid group. + return ReportError(JSMSG_INVALID_GROUP); default: return ReportError(JSMSG_INVALID_GROUP); } Advance(2); } else { - if (captures_ == nullptr) - captures_ = alloc->newInfallible(*alloc); if (captures_started() >= kMaxCaptures) return ReportError(JSMSG_TOO_MANY_PARENS); - captures_->append((RegExpCapture*) nullptr); + captures_started_++; /* only the counter changes here; the captures_ list is no longer appended to at this point */ } // Store current state and begin new disjunction parsing. - stored_state = alloc->newInfallible(alloc, stored_state, subexpr_type, - captures_started()); - builder = stored_state->builder(); + state = alloc->newInfallible(alloc, state, subexpr_type, + lookaround_type, captures_started_); + builder = state->builder(); continue; } case '[': { @@ -1600,19 +1649,18 @@ RegExpParser::ParseDisjunction() case '7': case '8': case '9': { int index = 0; if (ParseBackReferenceIndex(&index)) { - RegExpCapture* capture = nullptr; - if (captures_ != nullptr && index <= (int) captures_->length()) { - capture = (*captures_)[index - 1]; + if (state->IsInsideCaptureGroup(index)) { + // The backreference is inside the capture group it refers to. 
+ // Nothing can possibly have been captured yet, so add an empty node. + builder->AddEmpty(); + } else { + RegExpCapture* capture = GetCapture(index); + RegExpTree* atom = alloc->newInfallible(capture); + if (unicode_) + builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); + else + builder->AddAtom(atom); } - if (capture == nullptr) { - builder->AddEmpty(); - break; - } - RegExpTree* atom = alloc->newInfallible(capture); - if (unicode_) - builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); - else - builder->AddAtom(atom); break; } if (unicode_) diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h index 0a7e61858..2f02625b5 100644 --- a/js/src/irregexp/RegExpParser.h +++ b/js/src/irregexp/RegExpParser.h @@ -229,7 +229,7 @@ class RegExpParser bool simple() { return simple_; } bool contains_anchor() { return contains_anchor_; } void set_contains_anchor() { contains_anchor_ = true; } - int captures_started() { return captures_ == nullptr ? 0 : captures_->length(); } + int captures_started() { return captures_started_; } const CharT* position() { return next_pos_ - 1; } static const int kMaxCaptures = 1 << 16; @@ -239,8 +239,8 @@ class RegExpParser enum SubexpressionType { INITIAL, CAPTURE, // All positive values represent captures. - POSITIVE_LOOKAHEAD, - NEGATIVE_LOOKAHEAD, + POSITIVE_LOOKAROUND, + NEGATIVE_LOOKAROUND, GROUPING }; @@ -249,10 +249,12 @@ class RegExpParser RegExpParserState(LifoAlloc* alloc, RegExpParserState* previous_state, SubexpressionType group_type, + RegExpLookaround::Type lookaround_type, int disjunction_capture_index) : previous_state_(previous_state), builder_(alloc->newInfallible(alloc)), group_type_(group_type), + lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index) {} // Parser state of containing expression, if any. @@ -262,11 +264,16 @@ class RegExpParser RegExpBuilder* builder() { return builder_; } // Type of regexp being parsed (parenthesized group or entire regexp). 
SubexpressionType group_type() { return group_type_; } + // Read direction recorded for this state: lookahead or lookbehind. + RegExpLookaround::Type lookaround_type() { return lookaround_type_; } // Index in captures array of first capture in this sub-expression, if any. // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. int capture_index() { return disjunction_capture_index_; } + // Check whether this state or one of its enclosing states is a capture group with the given index. + bool IsInsideCaptureGroup(int index); + private: // Linked list implementation of stack of states. RegExpParserState* previous_state_; @@ -274,10 +281,15 @@ class RegExpParser RegExpBuilder* builder_; // Stored disjunction type (capture, look-ahead or grouping), if any. SubexpressionType group_type_; + // Stored read direction, returned by lookaround_type(). + RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). int disjunction_capture_index_; }; + // Return the RegExpCapture for a one-based capture index, allocating it if necessary. + RegExpCapture* GetCapture(int index); + widechar current() { return current_; } bool has_more() { return has_more_; } bool has_next() { return next_pos_ < end_; } @@ -294,6 +306,7 @@ class RegExpParser const CharT* next_pos_; const CharT* end_; widechar current_; + int captures_started_; /* value returned by captures_started(); initialized to 0 by the constructor */ // The capture count is only valid after we have scanned for captures. int capture_count_; bool has_more_;