1590 lines
61 KiB
C++
1590 lines
61 KiB
C++
// Copyright 2019 the V8 project authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style license that can be
|
||
// found in the LICENSE file.
|
||
|
||
#include "regexp/regexp-compiler.h"
|
||
|
||
#include "regexp/regexp.h"
|
||
#ifdef V8_INTL_SUPPORT
|
||
#include "regexp/special-case.h"
|
||
#endif // V8_INTL_SUPPORT
|
||
|
||
#ifdef V8_INTL_SUPPORT
|
||
#include "unicode/locid.h"
|
||
#include "unicode/uniset.h"
|
||
#include "unicode/utypes.h"
|
||
#endif // V8_INTL_SUPPORT
|
||
|
||
namespace v8 {
|
||
namespace internal {
|
||
|
||
using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
|
||
|
||
// -------------------------------------------------------------------
|
||
// Tree to graph conversion
|
||
|
||
// Wraps this atom in a one-element text list and returns a TextNode built from
// it. The node uses the compiler's current read direction and receives
// |on_success| as the node to continue with.
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  ZoneList<TextElement>* const elements =
      new (zone) ZoneList<TextElement>(1, zone);
  elements->Add(TextElement::Atom(this), zone);
  const bool read_backward = compiler->read_backward();
  return new (zone) TextNode(elements, read_backward, on_success);
}
|
||
|
||
// Returns a TextNode over this text's existing element list. The node uses the
// compiler's current read direction and receives |on_success| as the node to
// continue with.
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  const bool read_backward = compiler->read_backward();
  return new (zone) TextNode(elements(), read_backward, on_success);
}
|
||
|
||
// Returns true if |ranges| is exactly the complement (within
// [0, String::kMaxCodePoint]) of the class described by |special_class|.
//
// |special_class| holds |length| entries: pairs of (inclusive start, exclusive
// end) followed by a single kRangeEndMarker. The complement of N such pairs
// consists of N + 1 ranges: one before the first pair, one between each pair
// of neighbours, and one after the last pair.
static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
                                 const int* special_class, int length) {
  const int table_length = length - 1;  // Ignore the final marker.
  DCHECK_EQ(kRangeEndMarker, special_class[table_length]);
  DCHECK_NE(0, ranges->length());
  DCHECK_NE(0, table_length);
  DCHECK_NE(0, special_class[0]);

  const int expected_range_count = (table_length >> 1) + 1;
  if (ranges->length() != expected_range_count) return false;

  // The complement has to start at code point zero.
  CharacterRange gap = ranges->at(0);
  if (gap.from() != 0) return false;

  for (int pair = 0; (pair << 1) < table_length; ++pair) {
    const int start_index = pair << 1;
    // The current gap must end right before the pair begins ...
    if (special_class[start_index] != (gap.to() + 1)) return false;
    // ... and the next gap must begin exactly at the pair's exclusive end.
    gap = ranges->at(pair + 1);
    if (special_class[start_index + 1] != gap.from()) return false;
  }

  // The last gap has to extend up to the largest code point.
  return gap.to() == String::kMaxCodePoint;
}
|
||
|
||
// Returns true if |ranges| matches, range for range, the class described by
// |special_class|.
//
// |special_class| holds |length| entries: pairs of (inclusive start, exclusive
// end) followed by a single kRangeEndMarker.
static bool CompareRanges(ZoneList<CharacterRange>* ranges,
                          const int* special_class, int length) {
  const int table_length = length - 1;  // Ignore the final marker.
  DCHECK_EQ(kRangeEndMarker, special_class[table_length]);

  // Each range corresponds to two table entries.
  const int range_count = ranges->length();
  if (range_count * 2 != table_length) return false;

  for (int index = 0; index < range_count; ++index) {
    CharacterRange range = ranges->at(index);
    const int start_index = index * 2;
    const bool same_start = range.from() == special_class[start_index];
    // Table ends are exclusive while CharacterRange ends are inclusive.
    const bool same_end = range.to() == special_class[start_index + 1] - 1;
    if (!(same_start && same_end)) return false;
  }
  return true;
}
|
||
|
||
// Reports whether this class is one of the standard character sets. Negated
// classes are never standard. If the underlying set is not already flagged as
// standard, its ranges are compared against the known range tables; on the
// first match the corresponding standard set type is recorded on the set.
bool RegExpCharacterClass::is_standard(Zone* zone) {
  // TODO(lrn): Remove need for this function, by not throwing away information
  // along the way.
  if (is_negated()) return false;
  if (set_.is_standard()) return true;

  struct StandardSet {
    const int* table;   // Range table ending in kRangeEndMarker.
    int table_length;   // Entry count of |table|, marker included.
    bool complemented;  // Compare against the complement of |table|.
    char type;          // Set type to record when the comparison succeeds.
  };
  // Candidates are probed in exactly this order; the first hit wins.
  const StandardSet kCandidates[] = {
      {kSpaceRanges, kSpaceRangeCount, false, 's'},
      {kSpaceRanges, kSpaceRangeCount, true, 'S'},
      {kLineTerminatorRanges, kLineTerminatorRangeCount, true, '.'},
      {kLineTerminatorRanges, kLineTerminatorRangeCount, false, 'n'},
      {kWordRanges, kWordRangeCount, false, 'w'},
      {kWordRanges, kWordRangeCount, true, 'W'},
  };

  for (const StandardSet& candidate : kCandidates) {
    ZoneList<CharacterRange>* const ranges = set_.ranges(zone);
    const bool matches =
        candidate.complemented
            ? CompareInverseRanges(ranges, candidate.table,
                                   candidate.table_length)
            : CompareRanges(ranges, candidate.table, candidate.table_length);
    if (!matches) continue;
    set_.set_standard_set_type(candidate.type);
    return true;
  }
  return false;
}
|
||
|
||
// Feeds every range of |base| through AddRange().
UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
  // The unicode range splitter categorizes given character ranges into:
  // - Code points from the BMP representable by one code unit.
  // - Code points outside the BMP that need to be split into surrogate pairs.
  // - Lone lead surrogates.
  // - Lone trail surrogates.
  // Lone surrogates are valid code points, even though no actual characters.
  // They require special matching to make sure we do not split surrogate pairs.

  const int range_count = base->length();
  for (int index = 0; index < range_count; ++index) {
    AddRange(base->at(index));
  }
}
|
||
|
||
// Cuts |range| at the boundaries of the five contiguous code point regions
// (BMP below the surrogates, lead surrogates, trail surrogates, BMP above the
// surrogates, non-BMP) and appends each non-empty piece to the vector that
// collects that region. Both BMP regions feed the same vector.
void UnicodeRangeSplitter::AddRange(CharacterRange range) {
  static constexpr uc32 kBmp1Start = 0;
  static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
  static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
  static constexpr uc32 kBmp2End = kNonBmpStart - 1;

  // Ends are all inclusive. Together the regions tile the code point space
  // without gaps or overlap.
  STATIC_ASSERT(kBmp1Start == 0);
  STATIC_ASSERT(kBmp1Start < kBmp1End);
  STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
  STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
  STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
  STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
  STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
  STATIC_ASSERT(kBmp2Start < kBmp2End);
  STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
  STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);

  // One entry per region, in ascending code point order.
  const struct {
    uc32 start;
    uc32 end;  // Inclusive.
    CharacterRangeVector* target;
  } kRegions[] = {
      {kBmp1Start, kBmp1End, &bmp_},
      {kLeadSurrogateStart, kLeadSurrogateEnd, &lead_surrogates_},
      {kTrailSurrogateStart, kTrailSurrogateEnd, &trail_surrogates_},
      {kBmp2Start, kBmp2End, &bmp_},
      {kNonBmpStart, kNonBmpEnd, &non_bmp_},
  };

  const uc32 range_from = range.from();
  const uc32 range_to = range.to();
  for (const auto& region : kRegions) {
    // Regions are sorted, so nothing further up can intersect |range|.
    if (region.start > range_to) break;
    const uc32 clipped_from = std::max(region.start, range_from);
    const uc32 clipped_to = std::min(region.end, range_to);
    if (clipped_from <= clipped_to) {
      region.target->emplace_back(
          CharacterRange::Range(clipped_from, clipped_to));
    }
  }
}
|
||
|
||
namespace {
|
||
|
||
// Translates between new and old V8-isms (SmallVector, ZoneList).
|
||
ZoneList<CharacterRange>* ToCanonicalZoneList(
|
||
const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
|
||
if (v->empty()) return nullptr;
|
||
|
||
ZoneList<CharacterRange>* result =
|
||
new (zone) ZoneList<CharacterRange>(static_cast<int>(v->size()), zone);
|
||
for (size_t i = 0; i < v->size(); i++) {
|
||
result->Add(v->at(i), zone);
|
||
}
|
||
|
||
CharacterRange::Canonicalize(result);
|
||
return result;
|
||
}
|
||
|
||
// Adds one alternative to |result| that matches the BMP ranges collected by
// |splitter|. Does nothing when the splitter holds no BMP ranges.
void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const bmp_ranges =
      ToCanonicalZoneList(splitter->bmp(), zone);
  if (bmp_ranges == nullptr) return;

  JSRegExp::Flags default_flags = JSRegExp::Flags();
  RegExpNode* const bmp_node = TextNode::CreateForCharacterRanges(
      zone, bmp_ranges, compiler->read_backward(), on_success, default_flags);
  result->AddAlternative(GuardedAlternative(bmp_node));
}
|
||
|
||
// Adds alternatives to |result| that match the non-BMP ranges collected by
// |splitter| as lead/trail surrogate pairs. Each code point range yields up to
// three alternatives. Does nothing when there are no non-BMP ranges.
void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
                             RegExpNode* on_success,
                             UnicodeRangeSplitter* splitter) {
  ZoneList<CharacterRange>* const astral_ranges =
      ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
  if (astral_ranges == nullptr) return;
  DCHECK(!compiler->one_byte());
  Zone* const zone = compiler->zone();
  JSRegExp::Flags default_flags = JSRegExp::Flags();
  // NOTE(review): the list was already canonicalized by ToCanonicalZoneList;
  // the call is kept to preserve the original behavior.
  CharacterRange::Canonicalize(astral_ranges);

  // Appends the alternative <leads><trails> to |result|.
  const auto add_pair = [&](CharacterRange leads, CharacterRange trails) {
    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
        zone, leads, trails, compiler->read_backward(), on_success,
        default_flags)));
  };

  for (int index = 0; index < astral_ranges->length(); ++index) {
    // Match surrogate pair.
    // E.g. [\u10005-\u11005] becomes
    //      \ud800[\udc05-\udfff]|
    //      [\ud801-\ud803][\udc00-\udfff]|
    //      \ud804[\udc00-\udc05]
    const uc32 first = astral_ranges->at(index).from();
    const uc32 last = astral_ranges->at(index).to();
    uc16 lead_lo = unibrow::Utf16::LeadSurrogate(first);
    const uc16 trail_lo = unibrow::Utf16::TrailSurrogate(first);
    uc16 lead_hi = unibrow::Utf16::LeadSurrogate(last);
    const uc16 trail_hi = unibrow::Utf16::TrailSurrogate(last);

    if (lead_lo == lead_hi) {
      // The whole range shares one lead surrogate.
      add_pair(CharacterRange::Singleton(lead_lo),
               CharacterRange::Range(trail_lo, trail_hi));
      continue;
    }

    if (trail_lo != kTrailSurrogateStart) {
      // Partial first row: [lead_lo][trail_lo-\udfff]
      add_pair(CharacterRange::Singleton(lead_lo),
               CharacterRange::Range(trail_lo, kTrailSurrogateEnd));
      lead_lo++;
    }
    if (trail_hi != kTrailSurrogateEnd) {
      // Partial last row: [lead_hi][\udc00-trail_hi]
      add_pair(CharacterRange::Singleton(lead_hi),
               CharacterRange::Range(kTrailSurrogateStart, trail_hi));
      lead_hi--;
    }
    if (lead_lo <= lead_hi) {
      // Remaining full rows: [lead_lo-lead_hi][\udc00-\udfff]
      add_pair(CharacterRange::Range(lead_lo, lead_hi),
               CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
    }
  }
}
|
||
|
||
// Builds the graph for "negative lookaround, then match": first a negative
// lookaround for |lookbehind| that reads opposite to |read_backward|, then a
// match of |match| in the |read_backward| direction, continuing with
// |on_success|.
RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
    JSRegExp::Flags flags) {
  Zone* const zone = compiler->zone();

  // The node that runs once the lookaround has passed.
  RegExpNode* const after_lookaround = TextNode::CreateForCharacterRanges(
      zone, match, read_backward, on_success, flags);

  const int stack_register = compiler->UnicodeLookaroundStackRegister();
  const int position_register = compiler->UnicodeLookaroundPositionRegister();
  const bool is_positive = false;
  RegExpLookaround::Builder builder(is_positive, after_lookaround,
                                    stack_register, position_register);

  // The lookaround body reads against the main read direction.
  const bool lookaround_read_backward = !read_backward;
  RegExpNode* const lookaround_body = TextNode::CreateForCharacterRanges(
      zone, lookbehind, lookaround_read_backward, builder.on_match_success(),
      flags);
  return builder.ForMatch(lookaround_body);
}
|
||
|
||
// Builds the graph for "match, then negative lookaround": first a match of
// |match| in the |read_backward| direction, then a negative lookaround for
// |lookahead| reading in the same direction, continuing with |on_success|.
RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    bool read_backward, JSRegExp::Flags flags) {
  Zone* const zone = compiler->zone();

  const int stack_register = compiler->UnicodeLookaroundStackRegister();
  const int position_register = compiler->UnicodeLookaroundPositionRegister();
  const bool is_positive = false;
  RegExpLookaround::Builder builder(is_positive, on_success, stack_register,
                                    position_register);

  RegExpNode* const lookaround_body = TextNode::CreateForCharacterRanges(
      zone, lookahead, read_backward, builder.on_match_success(), flags);
  RegExpNode* const after_match = builder.ForMatch(lookaround_body);
  return TextNode::CreateForCharacterRanges(zone, match, read_backward,
                                            after_match, flags);
}
|
||
|
||
// Adds one alternative to |result| that matches the lead surrogates collected
// by |splitter|, but only where no trail surrogate follows them.
// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
// Does nothing when the splitter holds no lead surrogates.
void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                           RegExpNode* on_success,
                           UnicodeRangeSplitter* splitter) {
  JSRegExp::Flags default_flags = JSRegExp::Flags();
  ZoneList<CharacterRange>* const leads =
      ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
  if (leads == nullptr) return;

  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const any_trail = CharacterRange::List(
      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));

  // Reading backward: assert that reading forward, there is no trail
  // surrogate, and then backward match the lead surrogate.
  // Reading forward: forward match the lead surrogate and assert that no
  // trail surrogate follows.
  RegExpNode* const lone_lead =
      compiler->read_backward()
          ? NegativeLookaroundAgainstReadDirectionAndMatch(
                compiler, any_trail, leads, on_success, true, default_flags)
          : MatchAndNegativeLookaroundInReadDirection(
                compiler, leads, any_trail, on_success, false, default_flags);
  result->AddAlternative(GuardedAlternative(lone_lead));
}
|
||
|
||
// Adds one alternative to |result| that matches the trail surrogates collected
// by |splitter|, but only where no lead surrogate precedes them.
// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
// Does nothing when the splitter holds no trail surrogates.
void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                            RegExpNode* on_success,
                            UnicodeRangeSplitter* splitter) {
  JSRegExp::Flags default_flags = JSRegExp::Flags();
  ZoneList<CharacterRange>* const trails =
      ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
  if (trails == nullptr) return;

  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const any_lead = CharacterRange::List(
      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));

  // Reading backward: backward match the trail surrogate and assert that no
  // lead surrogate precedes it.
  // Reading forward: assert that reading backward, there is no lead
  // surrogate, and then forward match the trail surrogate.
  RegExpNode* const lone_trail =
      compiler->read_backward()
          ? MatchAndNegativeLookaroundInReadDirection(
                compiler, trails, any_lead, on_success, true, default_flags)
          : NegativeLookaroundAgainstReadDirectionAndMatch(
                compiler, any_lead, trails, on_success, false, default_flags);
  result->AddAlternative(GuardedAlternative(lone_trail));
}
|
||
|
||
// Returns a forward-reading node that consumes any single UTF-16 code unit and
// then continues with |on_success|. Must only be used when the compiler is
// reading forward.
RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
                              RegExpNode* on_success) {
  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
  DCHECK(!compiler->read_backward());
  Zone* const zone = compiler->zone();
  JSRegExp::Flags default_flags = JSRegExp::Flags();

  // Advance any character. If the character happens to be a lead surrogate and
  // we advanced into the middle of a surrogate pair, it will work out, as
  // nothing will match from there. We will have to advance again, consuming
  // the associated trail surrogate.
  const CharacterRange any_code_unit =
      CharacterRange::Range(0, String::kMaxUtf16CodeUnit);
  ZoneList<CharacterRange>* const ranges =
      CharacterRange::List(zone, any_code_unit);
  const bool read_backward = false;
  return TextNode::CreateForCharacterRanges(zone, ranges, read_backward,
                                            on_success, default_flags);
}
|
||
|
||
// Replaces the contents of |ranges| (which must be canonical) with its
// case-insensitive closure as computed by ICU, restricted to single code
// points. Without V8_INTL_SUPPORT this function does nothing.
void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
#ifdef V8_INTL_SUPPORT
  DCHECK(CharacterRange::IsCanonical(ranges));

  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
  // See also https://crbug.com/v8/6727.
  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
  // which we use frequently internally. But large ranges can also easily be
  // created by the user. We might want to have a more general caching mechanism
  // for such ranges.
  const bool covers_everything =
      ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd);
  if (covers_everything) return;

  // Use ICU to compute the case fold closure over the ranges.
  icu::UnicodeSet closure;
  const int input_count = ranges->length();
  for (int index = 0; index < input_count; ++index) {
    CharacterRange range = ranges->at(index);
    closure.add(range.from(), range.to());
  }
  closure.closeOver(USET_CASE_INSENSITIVE);
  // Full case mappings map single characters to multiple characters.
  // Those are represented as strings in the set. Remove them so that
  // we end up with only simple and common case mappings.
  closure.removeAllStrings();

  // Refill |ranges| from the closed set.
  ranges->Clear();
  const int closed_count = closure.getRangeCount();
  for (int index = 0; index < closed_count; ++index) {
    ranges->Add(CharacterRange::Range(closure.getRangeStart(index),
                                      closure.getRangeEnd(index)),
                zone);
  }
  // No errors and everything we collected have been ranges.
  CharacterRange::Canonicalize(ranges);
#endif  // V8_INTL_SUPPORT
}
|
||
|
||
} // namespace
|
||
|
||
// Converts this character class into a graph node.
//
// The set is canonicalized first and, if the flags require it, extended with
// Unicode case equivalents. Classes that are not unicode, are compiled for
// one-byte subjects, or contain a split surrogate become a single TextNode.
// All others are split into BMP characters, surrogate pairs and lone
// surrogates, each of which becomes an alternative of a ChoiceNode.
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
                                         RegExpNode* on_success) {
  set_.Canonicalize();
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* ranges = this->ranges(zone);
  if (NeedsUnicodeCaseEquivalents(flags_)) {
    AddUnicodeCaseEquivalents(ranges, zone);
  }

  const bool needs_surrogate_handling = IsUnicode(flags_) &&
                                        !compiler->one_byte() &&
                                        !contains_split_surrogate();
  if (!needs_surrogate_handling) {
    return new (zone) TextNode(this, compiler->read_backward(), on_success);
  }

  if (is_negated()) {
    // Work on the explicit complement from here on.
    ZoneList<CharacterRange>* const complement =
        new (zone) ZoneList<CharacterRange>(2, zone);
    CharacterRange::Negate(ranges, complement, zone);
    ranges = complement;
  }

  if (ranges->length() == 0) {
    // Nothing left to match: emit a node over the empty class.
    JSRegExp::Flags default_flags;
    RegExpCharacterClass* const empty_class =
        new (zone) RegExpCharacterClass(zone, ranges, default_flags);
    return new (zone)
        TextNode(empty_class, compiler->read_backward(), on_success);
  }

  if (standard_type() == '*') {
    return UnanchoredAdvance(compiler, on_success);
  }

  ChoiceNode* const choice = new (zone) ChoiceNode(2, zone);
  UnicodeRangeSplitter splitter(ranges);
  AddBmpCharacters(compiler, choice, on_success, &splitter);
  AddNonBmpSurrogatePairs(compiler, choice, on_success, &splitter);
  AddLoneLeadSurrogates(compiler, choice, on_success, &splitter);
  AddLoneTrailSurrogates(compiler, choice, on_success, &splitter);
  return choice;
}
|
||
|
||
// Three-way comparison of two atoms by their first code unit. Both |a| and |b|
// must point at atoms. Returns -1, 0 or 1.
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
  const uc16 lhs = (*a)->AsAtom()->data().at(0);
  const uc16 rhs = (*b)->AsAtom()->data().at(0);
  if (lhs == rhs) return 0;
  return lhs < rhs ? -1 : 1;
}
|
||
|
||
#ifdef V8_INTL_SUPPORT
|
||
|
||
// Case Insensitve comparesion
|
||
int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
|
||
RegExpAtom* atom1 = (*a)->AsAtom();
|
||
RegExpAtom* atom2 = (*b)->AsAtom();
|
||
icu::UnicodeString character1(atom1->data().at(0));
|
||
return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
|
||
}
|
||
|
||
#else
|
||
|
||
// Returns the canonical form of |c| according to |canonicalize|. If the
// mapping produces no output, |c| itself is returned.
static unibrow::uchar Canonical(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar c) {
  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
  const int mapped_length = canonicalize->get(c, '\0', mapped);
  DCHECK_LE(mapped_length, 1);
  return mapped_length == 1 ? mapped[0] : c;
}
|
||
|
||
// Compares two atoms by their first code unit, ignoring case. Both |a| and |b|
// must point at atoms. Returns zero for equal characters, otherwise the
// difference of the (possibly canonicalized) characters.
int CompareFirstCharCaseIndependent(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  unibrow::uchar lhs = (*a)->AsAtom()->data().at(0);
  unibrow::uchar rhs = (*b)->AsAtom()->data().at(0);
  if (lhs == rhs) return 0;

  // Canonicalization is skipped when both characters are below 'a'.
  const bool needs_canonicalization = lhs >= 'a' || rhs >= 'a';
  if (needs_canonicalization) {
    lhs = Canonical(canonicalize, lhs);
    rhs = Canonical(canonicalize, rhs);
  }
  return static_cast<int>(lhs) - static_cast<int>(rhs);
}
|
||
#endif // V8_INTL_SUPPORT
|
||
|
||
// We can stable sort runs of atoms, since the order does not matter if they
|
||
// start with different characters.
|
||
// Returns true if any consecutive atoms were found.
|
||
bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
|
||
ZoneList<RegExpTree*>* alternatives = this->alternatives();
|
||
int length = alternatives->length();
|
||
bool found_consecutive_atoms = false;
|
||
for (int i = 0; i < length; i++) {
|
||
while (i < length) {
|
||
RegExpTree* alternative = alternatives->at(i);
|
||
if (alternative->IsAtom()) break;
|
||
i++;
|
||
}
|
||
// i is length or it is the index of an atom.
|
||
if (i == length) break;
|
||
int first_atom = i;
|
||
JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
|
||
i++;
|
||
while (i < length) {
|
||
RegExpTree* alternative = alternatives->at(i);
|
||
if (!alternative->IsAtom()) break;
|
||
if (alternative->AsAtom()->flags() != flags) break;
|
||
i++;
|
||
}
|
||
// Sort atoms to get ones with common prefixes together.
|
||
// This step is more tricky if we are in a case-independent regexp,
|
||
// because it would change /is|I/ to /I|is/, and order matters when
|
||
// the regexp parts don't match only disjoint starting points. To fix
|
||
// this we have a version of CompareFirstChar that uses case-
|
||
// independent character classes for comparison.
|
||
DCHECK_LT(first_atom, alternatives->length());
|
||
DCHECK_LE(i, alternatives->length());
|
||
DCHECK_LE(first_atom, i);
|
||
if (IgnoreCase(flags)) {
|
||
#ifdef V8_INTL_SUPPORT
|
||
alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
|
||
i - first_atom);
|
||
#else
|
||
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
|
||
compiler->isolate()->regexp_macro_assembler_canonicalize();
|
||
auto compare_closure = [canonicalize](RegExpTree* const* a,
|
||
RegExpTree* const* b) {
|
||
return CompareFirstCharCaseIndependent(canonicalize, a, b);
|
||
};
|
||
alternatives->StableSort(compare_closure, first_atom, i - first_atom);
|
||
#endif // V8_INTL_SUPPORT
|
||
} else {
|
||
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
|
||
}
|
||
if (i - first_atom > 1) found_consecutive_atoms = true;
|
||
}
|
||
return found_consecutive_atoms;
|
||
}
|
||
|
||
// Optimizes ab|ac|az to a(?:b|c|d).
|
||
void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
|
||
Zone* zone = compiler->zone();
|
||
ZoneList<RegExpTree*>* alternatives = this->alternatives();
|
||
int length = alternatives->length();
|
||
|
||
int write_posn = 0;
|
||
int i = 0;
|
||
while (i < length) {
|
||
RegExpTree* alternative = alternatives->at(i);
|
||
if (!alternative->IsAtom()) {
|
||
alternatives->at(write_posn++) = alternatives->at(i);
|
||
i++;
|
||
continue;
|
||
}
|
||
RegExpAtom* const atom = alternative->AsAtom();
|
||
JSRegExp::Flags flags = atom->flags();
|
||
#ifdef V8_INTL_SUPPORT
|
||
icu::UnicodeString common_prefix(atom->data().at(0));
|
||
#else
|
||
unibrow::uchar common_prefix = atom->data().at(0);
|
||
#endif // V8_INTL_SUPPORT
|
||
int first_with_prefix = i;
|
||
int prefix_length = atom->length();
|
||
i++;
|
||
while (i < length) {
|
||
alternative = alternatives->at(i);
|
||
if (!alternative->IsAtom()) break;
|
||
RegExpAtom* const atom = alternative->AsAtom();
|
||
if (atom->flags() != flags) break;
|
||
#ifdef V8_INTL_SUPPORT
|
||
icu::UnicodeString new_prefix(atom->data().at(0));
|
||
if (new_prefix != common_prefix) {
|
||
if (!IgnoreCase(flags)) break;
|
||
if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
|
||
break;
|
||
}
|
||
#else
|
||
unibrow::uchar new_prefix = atom->data().at(0);
|
||
if (new_prefix != common_prefix) {
|
||
if (!IgnoreCase(flags)) break;
|
||
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
|
||
compiler->isolate()->regexp_macro_assembler_canonicalize();
|
||
new_prefix = Canonical(canonicalize, new_prefix);
|
||
common_prefix = Canonical(canonicalize, common_prefix);
|
||
if (new_prefix != common_prefix) break;
|
||
}
|
||
#endif // V8_INTL_SUPPORT
|
||
prefix_length = Min(prefix_length, atom->length());
|
||
i++;
|
||
}
|
||
if (i > first_with_prefix + 2) {
|
||
// Found worthwhile run of alternatives with common prefix of at least one
|
||
// character. The sorting function above did not sort on more than one
|
||
// character for reasons of correctness, but there may still be a longer
|
||
// common prefix if the terms were similar or presorted in the input.
|
||
// Find out how long the common prefix is.
|
||
int run_length = i - first_with_prefix;
|
||
RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
|
||
for (int j = 1; j < run_length && prefix_length > 1; j++) {
|
||
RegExpAtom* old_atom =
|
||
alternatives->at(j + first_with_prefix)->AsAtom();
|
||
for (int k = 1; k < prefix_length; k++) {
|
||
if (atom->data().at(k) != old_atom->data().at(k)) {
|
||
prefix_length = k;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
RegExpAtom* prefix = new (zone)
|
||
RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
|
||
ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
|
||
pair->Add(prefix, zone);
|
||
ZoneList<RegExpTree*>* suffixes =
|
||
new (zone) ZoneList<RegExpTree*>(run_length, zone);
|
||
for (int j = 0; j < run_length; j++) {
|
||
RegExpAtom* old_atom =
|
||
alternatives->at(j + first_with_prefix)->AsAtom();
|
||
int len = old_atom->length();
|
||
if (len == prefix_length) {
|
||
suffixes->Add(new (zone) RegExpEmpty(), zone);
|
||
} else {
|
||
RegExpTree* suffix = new (zone) RegExpAtom(
|
||
old_atom->data().SubVector(prefix_length, old_atom->length()),
|
||
flags);
|
||
suffixes->Add(suffix, zone);
|
||
}
|
||
}
|
||
pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
|
||
alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
|
||
} else {
|
||
// Just copy any non-worthwhile alternatives.
|
||
for (int j = first_with_prefix; j < i; j++) {
|
||
alternatives->at(write_posn++) = alternatives->at(j);
|
||
}
|
||
}
|
||
}
|
||
alternatives->Rewind(write_posn); // Trim end of array.
|
||
}
|
||
|
||
// Optimizes b|c|z to [bcz].
|
||
void RegExpDisjunction::FixSingleCharacterDisjunctions(
|
||
RegExpCompiler* compiler) {
|
||
Zone* zone = compiler->zone();
|
||
ZoneList<RegExpTree*>* alternatives = this->alternatives();
|
||
int length = alternatives->length();
|
||
|
||
int write_posn = 0;
|
||
int i = 0;
|
||
while (i < length) {
|
||
RegExpTree* alternative = alternatives->at(i);
|
||
if (!alternative->IsAtom()) {
|
||
alternatives->at(write_posn++) = alternatives->at(i);
|
||
i++;
|
||
continue;
|
||
}
|
||
RegExpAtom* const atom = alternative->AsAtom();
|
||
if (atom->length() != 1) {
|
||
alternatives->at(write_posn++) = alternatives->at(i);
|
||
i++;
|
||
continue;
|
||
}
|
||
JSRegExp::Flags flags = atom->flags();
|
||
DCHECK_IMPLIES(IsUnicode(flags),
|
||
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
|
||
bool contains_trail_surrogate =
|
||
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
|
||
int first_in_run = i;
|
||
i++;
|
||
// Find a run of single-character atom alternatives that have identical
|
||
// flags (case independence and unicode-ness).
|
||
while (i < length) {
|
||
alternative = alternatives->at(i);
|
||
if (!alternative->IsAtom()) break;
|
||
RegExpAtom* const atom = alternative->AsAtom();
|
||
if (atom->length() != 1) break;
|
||
if (atom->flags() != flags) break;
|
||
DCHECK_IMPLIES(IsUnicode(flags),
|
||
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
|
||
contains_trail_surrogate |=
|
||
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
|
||
i++;
|
||
}
|
||
if (i > first_in_run + 1) {
|
||
// Found non-trivial run of single-character alternatives.
|
||
int run_length = i - first_in_run;
|
||
ZoneList<CharacterRange>* ranges =
|
||
new (zone) ZoneList<CharacterRange>(2, zone);
|
||
for (int j = 0; j < run_length; j++) {
|
||
RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
|
||
DCHECK_EQ(old_atom->length(), 1);
|
||
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
|
||
}
|
||
RegExpCharacterClass::CharacterClassFlags character_class_flags;
|
||
if (IsUnicode(flags) && contains_trail_surrogate) {
|
||
character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
|
||
}
|
||
alternatives->at(write_posn++) = new (zone)
|
||
RegExpCharacterClass(zone, ranges, flags, character_class_flags);
|
||
} else {
|
||
// Just copy any trivial alternatives.
|
||
for (int j = first_in_run; j < i; j++) {
|
||
alternatives->at(write_posn++) = alternatives->at(j);
|
||
}
|
||
}
|
||
}
|
||
alternatives->Rewind(write_posn); // Trim end of array.
|
||
}
|
||
|
||
// Converts this disjunction into a graph node. With more than two
// alternatives, the atom-sorting, common-prefix and single-character passes
// run first; if they collapse the list to one alternative, that alternative's
// node is returned directly. Otherwise a ChoiceNode with one guarded
// alternative per entry is returned.
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  ZoneList<RegExpTree*>* const alternatives = this->alternatives();

  if (alternatives->length() > 2) {
    if (SortConsecutiveAtoms(compiler)) {
      RationalizeConsecutiveAtoms(compiler);
    }
    FixSingleCharacterDisjunctions(compiler);
    if (alternatives->length() == 1) {
      return alternatives->at(0)->ToNode(compiler, on_success);
    }
  }

  const int alternative_count = alternatives->length();
  Zone* const zone = compiler->zone();
  ChoiceNode* const choice = new (zone) ChoiceNode(alternative_count, zone);
  for (int index = 0; index < alternative_count; ++index) {
    RegExpNode* const node =
        alternatives->at(index)->ToNode(compiler, on_success);
    GuardedAlternative guarded(node);
    choice->AddAlternative(guarded);
  }
  return choice;
}
|
||
|
||
// Forwards this quantifier's bounds, greediness and body to the ToNode
// overload that takes them explicitly.
RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) {
  auto lower_bound = min();
  auto upper_bound = max();
  auto greedy = is_greedy();
  auto* quantified = body();
  return ToNode(lower_bound, upper_bound, greedy, quantified, compiler,
                on_success);
}
|
||
|
||
namespace {
|
||
// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
|
||
// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
|
||
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
|
||
RegExpNode* on_success,
|
||
RegExpAssertion::AssertionType type,
|
||
JSRegExp::Flags flags) {
|
||
DCHECK(NeedsUnicodeCaseEquivalents(flags));
|
||
Zone* zone = compiler->zone();
|
||
ZoneList<CharacterRange>* word_range =
|
||
new (zone) ZoneList<CharacterRange>(2, zone);
|
||
CharacterRange::AddClassEscape('w', word_range, true, zone);
|
||
int stack_register = compiler->UnicodeLookaroundStackRegister();
|
||
int position_register = compiler->UnicodeLookaroundPositionRegister();
|
||
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
|
||
// Add two choices. The (non-)boundary could start with a word or
|
||
// a non-word-character.
|
||
for (int i = 0; i < 2; i++) {
|
||
bool lookbehind_for_word = i == 0;
|
||
bool lookahead_for_word =
|
||
(type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
|
||
// Look to the left.
|
||
RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
|
||
stack_register, position_register);
|
||
RegExpNode* backward = TextNode::CreateForCharacterRanges(
|
||
zone, word_range, true, lookbehind.on_match_success(), flags);
|
||
// Look to the right.
|
||
RegExpLookaround::Builder lookahead(lookahead_for_word,
|
||
lookbehind.ForMatch(backward),
|
||
stack_register, position_register);
|
||
RegExpNode* forward = TextNode::CreateForCharacterRanges(
|
||
zone, word_range, false, lookahead.on_match_success(), flags);
|
||
result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
|
||
}
|
||
return result;
|
||
}
|
||
} // anonymous namespace
|
||
|
||
// Converts this assertion into a graph node that continues with |on_success|.
//
// Most assertion types map directly onto an AssertionNode. Word boundaries are
// desugared into lookarounds when the flags require Unicode case equivalents.
// END_OF_LINE becomes a choice between a positive lookahead for a line
// terminator and an end-of-input assertion.
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success) {
  Zone* zone = compiler->zone();

  switch (assertion_type()) {
    case START_OF_LINE:
      return AssertionNode::AfterNewline(on_success);
    case START_OF_INPUT:
      return AssertionNode::AtStart(on_success);
    case BOUNDARY:
      return NeedsUnicodeCaseEquivalents(flags_)
                 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
                                                 flags_)
                 : AssertionNode::AtBoundary(on_success);
    case NON_BOUNDARY:
      return NeedsUnicodeCaseEquivalents(flags_)
                 ? BoundaryAssertionAsLookaround(compiler, on_success,
                                                 NON_BOUNDARY, flags_)
                 : AssertionNode::AtNonBoundary(on_success);
    case END_OF_INPUT:
      return AssertionNode::AtEnd(on_success);
    case END_OF_LINE: {
      // Compile $ in multiline regexps as an alternation with a positive
      // lookahead in one side and an end-of-input on the other side.
      // We need two registers for the lookahead.
      int stack_pointer_register = compiler->AllocateRegister();
      int position_register = compiler->AllocateRegister();
      // The ChoiceNode to distinguish between a newline and end-of-input.
      ChoiceNode* result = new (zone) ChoiceNode(2, zone);
      // Create a newline atom. The class is identified by its standard set
      // type 'n', so no explicit range list has to be built here.
      JSRegExp::Flags default_flags = JSRegExp::Flags();
      RegExpCharacterClass* newline_atom =
          new (zone) RegExpCharacterClass('n', default_flags);
      TextNode* newline_matcher =
          new (zone) TextNode(newline_atom, false,
                              ActionNode::PositiveSubmatchSuccess(
                                  stack_pointer_register, position_register,
                                  0,   // No captures inside.
                                  -1,  // Ignored if no captures.
                                  on_success));
      // Wrap the newline matcher into a submatch to form the lookahead.
      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
          stack_pointer_register, position_register, newline_matcher);
      // Add the two alternatives to the ChoiceNode.
      GuardedAlternative eol_alternative(end_of_line);
      result->AddAlternative(eol_alternative);
      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
      result->AddAlternative(end_alternative);
      return result;
    }
    default:
      UNREACHABLE();
  }
  return on_success;
}
|
||
|
||
// Builds a BackReferenceNode that refers to the start/end registers of the
// capture with this back reference's index and then continues with
// |on_success|.
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
                                        RegExpNode* on_success) {
  const int capture_start = RegExpCapture::StartRegister(index());
  const int capture_end = RegExpCapture::EndRegister(index());
  Zone* zone = compiler->zone();
  return new (zone) BackReferenceNode(capture_start, capture_end, flags_,
                                      compiler->read_backward(), on_success);
}
|
||
|
||
// An empty term contributes no graph node of its own: the continuation is
// returned unchanged and |compiler| is not consulted.
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success) {
  RegExpNode* continuation = on_success;
  return continuation;
}
|
||
|
||
// Records the lookaround parameters and creates the node that the lookaround
// body has to continue with when it matches (on_match_success_):
//  - positive lookaround: a PositiveSubmatchSuccess action leading to
//    |on_success|;
//  - negative lookaround: a NegativeSubmatchSuccess node allocated in the
//    zone of |on_success|.
RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
                                   int stack_pointer_register,
                                   int position_register,
                                   int capture_register_count,
                                   int capture_register_start)
    : is_positive_(is_positive),
      on_success_(on_success),
      stack_pointer_register_(stack_pointer_register),
      position_register_(position_register) {
  if (!is_positive_) {
    Zone* zone = on_success_->zone();
    on_match_success_ = new (zone) NegativeSubmatchSuccess(
        stack_pointer_register, position_register, capture_register_count,
        capture_register_start, zone);
    return;
  }
  on_match_success_ = ActionNode::PositiveSubmatchSuccess(
      stack_pointer_register, position_register, capture_register_count,
      capture_register_start, on_success_);
}
|
||
|
||
// Wraps the already-built lookaround body |match| into a submatch and returns
// the entry node of the whole lookaround.
RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
  RegExpNode* submatch_body = match;
  if (!is_positive_) {
    Zone* zone = on_success_->zone();
    // We use a ChoiceNode to represent the negative lookaround. The first
    // alternative is the negative match. On success, the end node backtracks.
    // On failure, the second alternative is tried and leads to success.
    // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
    // first exit when calculating quick checks.
    ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
    submatch_body = choice_node;
  }
  return ActionNode::BeginSubmatch(stack_pointer_register_, position_register_,
                                   submatch_body);
}
|
||
|
||
// Converts a lookaround into graph nodes. Two fresh registers are allocated
// for the submatch, the body is compiled with the read direction implied by
// the lookaround type (backward for LOOKBEHIND), and the compiler's previous
// read direction is restored before returning.
RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) {
  const int stack_pointer_register = compiler->AllocateRegister();
  const int position_register = compiler->AllocateRegister();

  // Register span covered by the captures tracked for this lookaround:
  // two registers per capture, with capture registers starting at index 2.
  constexpr int kRegistersPerCapture = 2;
  constexpr int kRegisterOfFirstCapture = 2;
  const int register_count = capture_count_ * kRegistersPerCapture;
  const int register_start =
      kRegisterOfFirstCapture + capture_from_ * kRegistersPerCapture;

  const bool outer_read_backward = compiler->read_backward();
  compiler->set_read_backward(type() == LOOKBEHIND);

  Builder builder(is_positive(), on_success, stack_pointer_register,
                  position_register, register_count, register_start);
  RegExpNode* body_node = body_->ToNode(compiler, builder.on_match_success());
  RegExpNode* lookaround = builder.ForMatch(body_node);

  compiler->set_read_backward(outer_read_backward);
  return lookaround;
}
|
||
|
||
// Member entry point: forwards this capture's body and index to the static
// ToNode overload.
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
                                  RegExpNode* on_success) {
  RegExpTree* capture_body = body();
  const int capture_index = index();
  return ToNode(capture_body, capture_index, compiler, on_success);
}
|
||
|
||
// Surrounds |body| with two StorePosition actions for capture |index|.
// When the compiler reads backward, the roles of the capture's start and end
// registers are exchanged: the register stored before the body is the end
// register and the one stored after it is the start register.
RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
                                  RegExpCompiler* compiler,
                                  RegExpNode* on_success) {
  DCHECK_NOT_NULL(body);
  const int start_register = RegExpCapture::StartRegister(index);
  const int end_register = RegExpCapture::EndRegister(index);
  const bool backward = compiler->read_backward();
  // Register written before the body runs / after the body has matched.
  const int entry_register = backward ? end_register : start_register;
  const int exit_register = backward ? start_register : end_register;

  RegExpNode* after_body =
      ActionNode::StorePosition(exit_register, true, on_success);
  RegExpNode* body_node = body->ToNode(compiler, after_body);
  return ActionNode::StorePosition(entry_register, true, body_node);
}
|
||
|
||
namespace {
|
||
|
||
class AssertionSequenceRewriter final {
|
||
public:
|
||
// TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
|
||
// instead of sprinkling rewrites into the AST->Node conversion process.
|
||
static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
|
||
AssertionSequenceRewriter rewriter(terms, zone);
|
||
|
||
static constexpr int kNoIndex = -1;
|
||
int from = kNoIndex;
|
||
|
||
for (int i = 0; i < terms->length(); i++) {
|
||
RegExpTree* t = terms->at(i);
|
||
if (from == kNoIndex && t->IsAssertion()) {
|
||
from = i; // Start a sequence.
|
||
} else if (from != kNoIndex && !t->IsAssertion()) {
|
||
// Terminate and process the sequence.
|
||
if (i - from > 1) rewriter.Rewrite(from, i);
|
||
from = kNoIndex;
|
||
}
|
||
}
|
||
|
||
if (from != kNoIndex && terms->length() - from > 1) {
|
||
rewriter.Rewrite(from, terms->length());
|
||
}
|
||
}
|
||
|
||
// All assertions are zero width. A consecutive sequence of assertions is
|
||
// order-independent. There's two ways we can optimize here:
|
||
// 1. fold all identical assertions.
|
||
// 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
|
||
// sequence fails.
|
||
void Rewrite(int from, int to) {
|
||
DCHECK_GT(to, from + 1);
|
||
|
||
// Bitfield of all seen assertions.
|
||
uint32_t seen_assertions = 0;
|
||
STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
|
||
|
||
// Flags must match for folding.
|
||
JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
|
||
bool saw_mismatched_flags = false;
|
||
|
||
for (int i = from; i < to; i++) {
|
||
RegExpAssertion* t = terms_->at(i)->AsAssertion();
|
||
if (t->flags() != flags) saw_mismatched_flags = true;
|
||
const uint32_t bit = 1 << t->assertion_type();
|
||
|
||
if ((seen_assertions & bit) && !saw_mismatched_flags) {
|
||
// Fold duplicates.
|
||
terms_->Set(i, new (zone_) RegExpEmpty());
|
||
}
|
||
|
||
seen_assertions |= bit;
|
||
}
|
||
|
||
// Collapse failures.
|
||
const uint32_t always_fails_mask =
|
||
1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
|
||
if ((seen_assertions & always_fails_mask) == always_fails_mask) {
|
||
ReplaceSequenceWithFailure(from, to);
|
||
}
|
||
}
|
||
|
||
void ReplaceSequenceWithFailure(int from, int to) {
|
||
// Replace the entire sequence with a single node that always fails.
|
||
// TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
|
||
// negated '*' (everything) range serves the purpose.
|
||
ZoneList<CharacterRange>* ranges =
|
||
new (zone_) ZoneList<CharacterRange>(0, zone_);
|
||
RegExpCharacterClass* cc =
|
||
new (zone_) RegExpCharacterClass(zone_, ranges, JSRegExp::Flags());
|
||
terms_->Set(from, cc);
|
||
|
||
// Zero out the rest.
|
||
RegExpEmpty* empty = new (zone_) RegExpEmpty();
|
||
for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
|
||
}
|
||
|
||
private:
|
||
AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
|
||
: zone_(zone), terms_(terms) {}
|
||
|
||
Zone* zone_;
|
||
ZoneList<RegExpTree*>* terms_;
|
||
};
|
||
|
||
} // namespace
|
||
|
||
// Chains the child terms into a node sequence ending in |on_success|.
// Each child is built with the previously built node as its continuation, so
// children are visited last-to-first when reading forward and first-to-last
// when reading backward. Runs of assertions are simplified first.
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  ZoneList<RegExpTree*>* children = nodes();

  AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());

  const bool backward = compiler->read_backward();
  const int child_count = children->length();
  RegExpNode* continuation = on_success;
  for (int step = 0; step < child_count; step++) {
    const int child_index = backward ? step : child_count - 1 - step;
    continuation = children->at(child_index)->ToNode(compiler, continuation);
  }
  return continuation;
}
|
||
|
||
static void AddClass(const int* elmv, int elmc,
|
||
ZoneList<CharacterRange>* ranges, Zone* zone) {
|
||
elmc--;
|
||
DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
|
||
for (int i = 0; i < elmc; i += 2) {
|
||
DCHECK(elmv[i] < elmv[i + 1]);
|
||
ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
|
||
}
|
||
}
|
||
|
||
static void AddClassNegated(const int* elmv, int elmc,
|
||
ZoneList<CharacterRange>* ranges, Zone* zone) {
|
||
elmc--;
|
||
DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
|
||
DCHECK_NE(0x0000, elmv[0]);
|
||
DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
|
||
uc16 last = 0x0000;
|
||
for (int i = 0; i < elmc; i += 2) {
|
||
DCHECK(last <= elmv[i] - 1);
|
||
DCHECK(elmv[i] < elmv[i + 1]);
|
||
ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
|
||
last = elmv[i + 1];
|
||
}
|
||
ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
|
||
}
|
||
|
||
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
|
||
bool add_unicode_case_equivalents,
|
||
Zone* zone) {
|
||
if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
|
||
// See #sec-runtime-semantics-wordcharacters-abstract-operation
|
||
// In case of unicode and ignore_case, we need to create the closure over
|
||
// case equivalent characters before negating.
|
||
ZoneList<CharacterRange>* new_ranges =
|
||
new (zone) ZoneList<CharacterRange>(2, zone);
|
||
AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
|
||
AddUnicodeCaseEquivalents(new_ranges, zone);
|
||
if (type == 'W') {
|
||
ZoneList<CharacterRange>* negated =
|
||
new (zone) ZoneList<CharacterRange>(2, zone);
|
||
CharacterRange::Negate(new_ranges, negated, zone);
|
||
new_ranges = negated;
|
||
}
|
||
ranges->AddAll(*new_ranges, zone);
|
||
return;
|
||
}
|
||
AddClassEscape(type, ranges, zone);
|
||
}
|
||
|
||
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
|
||
Zone* zone) {
|
||
switch (type) {
|
||
case 's':
|
||
AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
|
||
break;
|
||
case 'S':
|
||
AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
|
||
break;
|
||
case 'w':
|
||
AddClass(kWordRanges, kWordRangeCount, ranges, zone);
|
||
break;
|
||
case 'W':
|
||
AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
|
||
break;
|
||
case 'd':
|
||
AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
|
||
break;
|
||
case 'D':
|
||
AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
|
||
break;
|
||
case '.':
|
||
AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
|
||
zone);
|
||
break;
|
||
// This is not a character range as defined by the spec but a
|
||
// convenient shorthand for a character class that matches any
|
||
// character.
|
||
case '*':
|
||
ranges->Add(CharacterRange::Everything(), zone);
|
||
break;
|
||
// This is the set of characters matched by the $ and ^ symbols
|
||
// in multiline mode.
|
||
case 'n':
|
||
AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
|
||
break;
|
||
default:
|
||
UNREACHABLE();
|
||
}
|
||
}
|
||
|
||
// Returns a view of the kWordRanges table that leaves out its last entry.
Vector<const int> CharacterRange::GetWordBounds() {
  const int bound_count = kWordRangeCount - 1;
  return Vector<const int>(kWordRanges, bound_count);
}
|
||
|
||
// static
|
||
// Canonicalizes |ranges| and then appends the case equivalents of the
// characters it contains. Only the part of each range that lies within the
// UTF-16 code unit space is considered, and ranges lying entirely inside the
// surrogate area are skipped. For one-byte subjects, ranges that contain no
// Latin1 equivalents are additionally clamped to the one-byte range.
// The appended ranges are not canonicalized again here.
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
                                        ZoneList<CharacterRange>* ranges,
                                        bool is_one_byte) {
  CharacterRange::Canonicalize(ranges);
  // Only ranges present at this point are expanded; ranges appended below
  // are not revisited.
  int range_count = ranges->length();
#ifdef V8_INTL_SUPPORT
  // Collect the relevant characters of all input ranges into one set.
  icu::UnicodeSet others;
  for (int index = 0; index < range_count; index++) {
    CharacterRange range = ranges->at(index);
    uc32 from = range.from();
    if (from > String::kMaxUtf16CodeUnit) continue;
    uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
    // Nothing to be done for surrogates.
    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
      if (from > String::kMaxOneByteCharCode) continue;
      if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
    }
    others.add(from, to);
  }

  // Compute the set of additional characters that should be added,
  // using UnicodeSet::closeOver. ECMA 262 defines slightly different
  // case-folding rules than Unicode, so some characters that are
  // added by closeOver do not match anything other than themselves in
  // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
  // same case-insensitive character as 's' or 'S' according to
  // Unicode, but does not match any other character in JS. To handle
  // this case, we add such characters to the IgnoreSet and filter
  // them out. We filter twice: once before calling closeOver (to
  // prevent 'ſ' from adding 's'), and once after calling closeOver
  // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
  // more information.
  icu::UnicodeSet already_added(others);
  others.removeAll(RegExpCaseFolding::IgnoreSet());
  others.closeOver(USET_CASE_INSENSITIVE);
  others.removeAll(RegExpCaseFolding::IgnoreSet());
  others.removeAll(already_added);

  // What remains in |others| is new: append it to |ranges|.
  for (int32_t index = 0; index < others.getRangeCount(); index++) {
    UChar32 from = others.getRangeStart(index);
    UChar32 to = others.getRangeEnd(index);
    if (from == to) {
      ranges->Add(CharacterRange::Singleton(from), zone);
    } else {
      ranges->Add(CharacterRange::Range(from, to), zone);
    }
  }
#else
  for (int index = 0; index < range_count; index++) {
    CharacterRange range = ranges->at(index);
    uc32 bottom = range.from();
    if (bottom > String::kMaxUtf16CodeUnit) continue;
    uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
    // Nothing to be done for surrogates.
    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
      if (bottom > String::kMaxOneByteCharCode) continue;
      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
    }
    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    if (top == bottom) {
      // If this is a singleton we just expand the one character.
      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
      for (int k = 0; k < length; k++) {
        uc32 chr = chars[k];
        if (chr != bottom) {
          ranges->Add(CharacterRange::Singleton(chars[k]), zone);
        }
      }
    } else {
      // If this is a range we expand the characters block by block, expanding
      // contiguous subranges (blocks) one at a time. The approach is as
      // follows. For a given start character we look up the remainder of the
      // block that contains it (represented by the end point), for instance we
      // find 'z' if the character is 'c'. A block is characterized by the
      // property that all characters uncanonicalize in the same way, except
      // that each entry in the result is incremented by the distance from the
      // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
      // we've found the end point we look up its uncanonicalization and
      // produce a range for each element. For instance for [c-f] we look up
      // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
      // it is not already contained in the input, so [c-f] will be skipped but
      // [C-F] will be added. If this range is not completely contained in a
      // block we do this for all the blocks covered by the range (handling
      // characters that are not in a block as a "singleton block").
      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int pos = bottom;
      while (pos <= top) {
        int length =
            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
        uc32 block_end;
        if (length == 0) {
          // |pos| is not part of a block: treat it as a block of its own.
          block_end = pos;
        } else {
          DCHECK_EQ(1, length);
          block_end = equivalents[0];
        }
        // Part of the block that is actually covered by the input range.
        int end = (block_end > top) ? top : block_end;
        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
                                                         equivalents);
        for (int k = 0; k < length; k++) {
          uc32 c = equivalents[k];
          // Shift the block end's equivalent back to [pos, end].
          uc32 range_from = c - (block_end - pos);
          uc32 range_to = c - (block_end - end);
          if (!(bottom <= range_from && range_to <= top)) {
            ranges->Add(CharacterRange::Range(range_from, range_to), zone);
          }
        }
        pos = end + 1;
      }
    }
  }
#endif  // V8_INTL_SUPPORT
}
|
||
|
||
// Returns true if |ranges| is canonical: every range starts at least two
// positions after the end of its predecessor (so ranges are increasing,
// non-overlapping and non-adjacent). Lists with at most one range are
// canonical.
bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
  DCHECK_NOT_NULL(ranges);
  const int range_count = ranges->length();
  for (int index = 1; index < range_count; index++) {
    int previous_to = ranges->at(index - 1).to();
    CharacterRange current = ranges->at(index);
    if (current.from() <= previous_to + 1) return false;
  }
  return true;
}
|
||
|
||
// Returns the range list of this set. If no list exists yet, it is created on
// first use from the standard set type (without unicode case equivalents) and
// cached in ranges_.
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
  if (ranges_ != nullptr) return ranges_;

  ranges_ = new (zone) ZoneList<CharacterRange>(2, zone);
  CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
  return ranges_;
}
|
||
|
||
// Move a number of elements in a zonelist to another position
|
||
// in the same list. Handles overlapping source and target areas.
|
||
static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
|
||
int count) {
|
||
// Ranges are potentially overlapping.
|
||
if (from < to) {
|
||
for (int i = count - 1; i >= 0; i--) {
|
||
list->at(to + i) = list->at(from + i);
|
||
}
|
||
} else {
|
||
for (int i = 0; i < count; i++) {
|
||
list->at(to + i) = list->at(from + i);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Inserts a range into list[0..count[, which must be sorted
// by from value and non-overlapping and non-adjacent, using at most
// list[0..count] for the result. Returns the number of resulting
// canonicalized ranges. Inserting a range may collapse existing ranges into
// fewer ranges, so the return value can be anything in the range 1..count+1.
static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
                                      CharacterRange insert) {
  uc32 from = insert.from();
  uc32 to = insert.to();

  // Scan from the back for the window [start_pos..end_pos[ of existing ranges
  // that overlap, or are adjacent to, the inserted range.
  int start_pos = 0;
  int end_pos = count;
  int scan = count - 1;
  while (scan >= 0) {
    CharacterRange current = list->at(scan);
    if (current.from() > to + 1) {
      // Entirely above the inserted range.
      end_pos = scan;
    } else if (current.to() + 1 < from) {
      // Entirely below the inserted range; so is everything before it.
      start_pos = scan + 1;
      break;
    }
    scan--;
  }

  // Ranges before start_pos or at or after end_pos are not affected by the
  // insertion.
  const int affected = end_pos - start_pos;

  if (affected == 0) {
    // Nothing to merge with: open a slot at start_pos and store the range.
    if (start_pos < count) {
      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
    }
    list->at(start_pos) = insert;
    return count + 1;
  }

  // Merge the inserted range with all affected ranges into a single range
  // stored at start_pos.
  int new_from = Min(list->at(start_pos).from(), from);
  int new_to = Max(list->at(end_pos - 1).to(), to);
  // When exactly one range is replaced the tail is already in place;
  // otherwise move the remaining ranges down.
  if (affected != 1 && end_pos < count) {
    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
  }
  list->at(start_pos) = CharacterRange::Range(new_from, new_to);
  return count - affected + 1;
}
|
||
|
||
// Canonicalizes the explicit range list, if there is one.
// Special/default classes (no range list yet) are always considered
// canonical. The result of calling ranges() will be sorted.
void CharacterSet::Canonicalize() {
  if (ranges_ != nullptr) {
    CharacterRange::Canonicalize(ranges_);
  }
}
|
||
|
||
// Brings |character_ranges| into canonical form in place: increasing,
// non-overlapping and non-adjacent ranges. The list is shortened if ranges
// get merged.
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
  const int n = character_ranges->length();
  if (n <= 1) return;

  // Determine the length of the prefix that is already canonical.
  int canonical_prefix = 1;
  int max = character_ranges->at(0).to();
  for (; canonical_prefix < n; canonical_prefix++) {
    CharacterRange current = character_ranges->at(canonical_prefix);
    if (current.from() <= max + 1) break;
    max = current.to();
  }
  // If the prefix covers the whole list, we are done.
  if (canonical_prefix == n) return;

  // The ranges from index canonical_prefix on are not canonicalized. Make
  // them so by doing the equivalent of insertion sort (inserting each into
  // the previous list, in order).
  // Notice that inserting a range can reduce the number of ranges in the
  // result due to combining of adjacent and overlapping ranges.
  int num_canonical = canonical_prefix;  // Length of canonicalized part.
  for (int read = canonical_prefix; read < n; read++) {
    num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
                                               character_ranges->at(read));
  }
  character_ranges->Rewind(num_canonical);

  DCHECK(CharacterRange::IsCanonical(character_ranges));
}
|
||
|
||
// Writes the complement of |ranges| with respect to
// [0, String::kMaxCodePoint] into |negated_ranges|.
//
// |ranges| must be canonical and |negated_ranges| must be empty on entry.
// The output consists of the gaps between consecutive input ranges, plus the
// part before the first range (if it does not start at 0) and the part after
// the last range (if it does not end at String::kMaxCodePoint).
void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
                            ZoneList<CharacterRange>* negated_ranges,
                            Zone* zone) {
  DCHECK(CharacterRange::IsCanonical(ranges));
  DCHECK_EQ(0, negated_ranges->length());
  int range_count = ranges->length();
  // First code point that has not been classified yet.
  uc32 from = 0;
  int i = 0;
  if (range_count > 0 && ranges->at(0).from() == 0) {
    // The input starts at 0, so there is no gap in front of the first range.
    from = ranges->at(0).to() + 1;
    i = 1;
  }
  while (i < range_count) {
    CharacterRange range = ranges->at(i);
    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
    from = range.to() + 1;
    i++;
  }
  // Emit the tail. The comparison must be inclusive: if the last input range
  // ends at kMaxCodePoint - 1, |from| equals kMaxCodePoint and the singleton
  // [kMaxCodePoint] belongs to the complement. If the input already reaches
  // kMaxCodePoint, |from| is kMaxCodePoint + 1 and nothing is added.
  if (from <= String::kMaxCodePoint) {
    negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
                        zone);
  }
}
|
||
|
||
// Scoped object to keep track of how much we unroll quantifier loops in the
|
||
// regexp graph generator.
|
||
class RegExpExpansionLimiter {
|
||
public:
|
||
static const int kMaxExpansionFactor = 6;
|
||
RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
|
||
: compiler_(compiler),
|
||
saved_expansion_factor_(compiler->current_expansion_factor()),
|
||
ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
|
||
DCHECK_LT(0, factor);
|
||
if (ok_to_expand_) {
|
||
if (factor > kMaxExpansionFactor) {
|
||
// Avoid integer overflow of the current expansion factor.
|
||
ok_to_expand_ = false;
|
||
compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
|
||
} else {
|
||
int new_factor = saved_expansion_factor_ * factor;
|
||
ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
|
||
compiler->set_current_expansion_factor(new_factor);
|
||
}
|
||
}
|
||
}
|
||
|
||
~RegExpExpansionLimiter() {
|
||
compiler_->set_current_expansion_factor(saved_expansion_factor_);
|
||
}
|
||
|
||
bool ok_to_expand() { return ok_to_expand_; }
|
||
|
||
private:
|
||
RegExpCompiler* compiler_;
|
||
int saved_expansion_factor_;
|
||
bool ok_to_expand_;
|
||
|
||
DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
|
||
};
|
||
|
||
// Builds the graph for |body| repeated between |min| and |max| times
// (|max| may be kInfinity), continuing with |on_success|. Small quantifiers
// over bodies that cannot be empty and contain no captures may be unrolled
// when the compiler optimizes; everything else becomes a LoopChoiceNode,
// with a counter register if the repetition count has to be tracked.
RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
                                     RegExpTree* body, RegExpCompiler* compiler,
                                     RegExpNode* on_success,
                                     bool not_at_start) {
  // x{f, t} becomes this:
  //
  //             (r++)<-.
  //               |     `
  //               |     (x)
  //               v     ^
  //      (r=0)-->(?)---/ [if r < t]
  //               |
  //   [if r >= f] \----> ...
  //

  // 15.10.2.5 RepeatMatcher algorithm.
  // The parser has already eliminated the case where max is 0.  In the case
  // where max_match is zero the parser has removed the quantifier if min was
  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.

  // If we know that we cannot match zero length then things are a little
  // simpler since we don't need to make the special zero length match check
  // from step 2.1.  If the min and max are small we can unroll a little in
  // this case.
  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
  if (max == 0) return on_success;  // This can happen due to recursion.

  const bool body_can_be_empty = (body->min_match() == 0);
  int body_start_reg = RegExpCompiler::kNoRegister;
  Interval capture_registers = body->CaptureRegisters();
  const bool needs_capture_clearing = !capture_registers.is_empty();
  Zone* zone = compiler->zone();

  if (body_can_be_empty) {
    body_start_reg = compiler->AllocateRegister();
  } else if (compiler->optimize() && !needs_capture_clearing) {
    // Only unroll if there are no captures and the body can't be
    // empty.
    {
      // The limiter must stay alive while the unrolled nodes are built.
      RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
        const int remaining_max = (max == kInfinity) ? max : max - min;
        // Recurse once to get the loop or optional matches after the fixed
        // ones.
        RegExpNode* unrolled = ToNode(0, remaining_max, is_greedy, body,
                                      compiler, on_success, true);
        // Unroll the forced matches from 0 to min.  This can cause chains of
        // TextNodes (which the parser does not generate).  These should be
        // combined if it turns out they hinder good code generation.
        for (int copy = 0; copy < min; copy++) {
          unrolled = body->ToNode(compiler, unrolled);
        }
        return unrolled;
      }
    }
    if (max <= kMaxUnrolledMaxMatches && min == 0) {
      DCHECK_LT(0, max);  // Due to the 'if' above.
      RegExpExpansionLimiter limiter(compiler, max);
      if (limiter.ok_to_expand()) {
        // Unroll the optional matches up to max. Each level is a choice
        // between one more body match and leaving through on_success; the
        // order of the two alternatives encodes greediness.
        RegExpNode* unrolled = on_success;
        for (int level = 0; level < max; level++) {
          ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
          if (is_greedy) {
            alternation->AddAlternative(
                GuardedAlternative(body->ToNode(compiler, unrolled)));
            alternation->AddAlternative(GuardedAlternative(on_success));
          } else {
            alternation->AddAlternative(GuardedAlternative(on_success));
            alternation->AddAlternative(
                GuardedAlternative(body->ToNode(compiler, unrolled)));
          }
          unrolled = alternation;
          if (not_at_start && !compiler->read_backward()) {
            alternation->set_not_at_start();
          }
        }
        return unrolled;
      }
    }
  }

  // General case: build an explicit loop around a LoopChoiceNode.
  const bool has_min = min > 0;
  const bool has_max = max < RegExpTree::kInfinity;
  const bool needs_counter = has_min || has_max;
  const int reg_ctr = needs_counter ? compiler->AllocateRegister()
                                    : RegExpCompiler::kNoRegister;
  LoopChoiceNode* center = new (zone) LoopChoiceNode(
      body->min_match() == 0, compiler->read_backward(), min, zone);
  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();

  // Node the body continues with after each iteration: back to the loop
  // center, through a counter increment if a counter is in use.
  RegExpNode* loop_return = center;
  if (needs_counter) {
    loop_return = ActionNode::IncrementRegister(reg_ctr, center);
  }
  if (body_can_be_empty) {
    // If the body can be empty we need to check if it was and then
    // backtrack.
    loop_return =
        ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
  }

  RegExpNode* body_node = body->ToNode(compiler, loop_return);
  if (body_can_be_empty) {
    // If the body can be empty we need to store the start position
    // so we can bail out if it was empty.
    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
  }
  if (needs_capture_clearing) {
    // Before entering the body of this loop we need to clear captures.
    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
  }

  // Entering the body is only allowed while the counter is below max.
  GuardedAlternative body_alt(body_node);
  if (has_max) {
    Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
    body_alt.AddGuard(body_guard, zone);
  }
  // Leaving the loop is only allowed once the counter has reached min.
  GuardedAlternative rest_alt(on_success);
  if (has_min) {
    Guard* rest_guard = new (compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
    rest_alt.AddGuard(rest_guard, zone);
  }

  // Greedy loops try the body first, non-greedy loops the continuation.
  if (is_greedy) {
    center->AddLoopAlternative(body_alt);
    center->AddContinueAlternative(rest_alt);
  } else {
    center->AddContinueAlternative(rest_alt);
    center->AddLoopAlternative(body_alt);
  }

  if (!needs_counter) return center;
  // The counter is set to 0 before the loop is entered.
  return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
}
|
||
|
||
} // namespace internal
|
||
} // namespace v8
|