Mypal/js/src/regexp/regexp-compiler-tonode.cc

1590 lines
61 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-compiler.h"
#include "regexp/regexp.h"
#ifdef V8_INTL_SUPPORT
#include "regexp/special-case.h"
#endif // V8_INTL_SUPPORT
#ifdef V8_INTL_SUPPORT
#include "unicode/locid.h"
#include "unicode/uniset.h"
#include "unicode/utypes.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
// -------------------------------------------------------------------
// Tree to graph conversion
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<TextElement>* elms =
new (compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
elms->Add(TextElement::Atom(this), compiler->zone());
return new (compiler->zone())
TextNode(elms, compiler->read_backward(), on_success);
}
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return new (compiler->zone())
TextNode(elements(), compiler->read_backward(), on_success);
}
static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
const int* special_class, int length) {
length--; // Remove final marker.
DCHECK_EQ(kRangeEndMarker, special_class[length]);
DCHECK_NE(0, ranges->length());
DCHECK_NE(0, length);
DCHECK_NE(0, special_class[0]);
if (ranges->length() != (length >> 1) + 1) {
return false;
}
CharacterRange range = ranges->at(0);
if (range.from() != 0) {
return false;
}
for (int i = 0; i < length; i += 2) {
if (special_class[i] != (range.to() + 1)) {
return false;
}
range = ranges->at((i >> 1) + 1);
if (special_class[i + 1] != range.from()) {
return false;
}
}
if (range.to() != String::kMaxCodePoint) {
return false;
}
return true;
}
static bool CompareRanges(ZoneList<CharacterRange>* ranges,
const int* special_class, int length) {
length--; // Remove final marker.
DCHECK_EQ(kRangeEndMarker, special_class[length]);
if (ranges->length() * 2 != length) {
return false;
}
for (int i = 0; i < length; i += 2) {
CharacterRange range = ranges->at(i >> 1);
if (range.from() != special_class[i] ||
range.to() != special_class[i + 1] - 1) {
return false;
}
}
return true;
}
bool RegExpCharacterClass::is_standard(Zone* zone) {
// TODO(lrn): Remove need for this function, by not throwing away information
// along the way.
if (is_negated()) {
return false;
}
if (set_.is_standard()) {
return true;
}
if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
set_.set_standard_set_type('s');
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
set_.set_standard_set_type('S');
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
kLineTerminatorRangeCount)) {
set_.set_standard_set_type('.');
return true;
}
if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
kLineTerminatorRangeCount)) {
set_.set_standard_set_type('n');
return true;
}
if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('w');
return true;
}
if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
set_.set_standard_set_type('W');
return true;
}
return false;
}
UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
// The unicode range splitter categorizes given character ranges into:
// - Code points from the BMP representable by one code unit.
// - Code points outside the BMP that need to be split into surrogate pairs.
// - Lone lead surrogates.
// - Lone trail surrogates.
// Lone surrogates are valid code points, even though no actual characters.
// They require special matching to make sure we do not split surrogate pairs.
for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
}
void UnicodeRangeSplitter::AddRange(CharacterRange range) {
static constexpr uc32 kBmp1Start = 0;
static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1;
static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1;
static constexpr uc32 kBmp2End = kNonBmpStart - 1;
// Ends are all inclusive.
STATIC_ASSERT(kBmp1Start == 0);
STATIC_ASSERT(kBmp1Start < kBmp1End);
STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart);
STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd);
STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd);
STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start);
STATIC_ASSERT(kBmp2Start < kBmp2End);
STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart);
STATIC_ASSERT(kNonBmpStart < kNonBmpEnd);
static constexpr uc32 kStarts[] = {
kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart,
kBmp2Start, kNonBmpStart,
};
static constexpr uc32 kEnds[] = {
kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
};
CharacterRangeVector* const kTargets[] = {
&bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
};
static constexpr int kCount = arraysize(kStarts);
STATIC_ASSERT(kCount == arraysize(kEnds));
STATIC_ASSERT(kCount == arraysize(kTargets));
for (int i = 0; i < kCount; i++) {
if (kStarts[i] > range.to()) break;
const uc32 from = std::max(kStarts[i], range.from());
const uc32 to = std::min(kEnds[i], range.to());
if (from > to) continue;
kTargets[i]->emplace_back(CharacterRange::Range(from, to));
}
}
namespace {
// Translates between new and old V8-isms (SmallVector, ZoneList).
ZoneList<CharacterRange>* ToCanonicalZoneList(
const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
if (v->empty()) return nullptr;
ZoneList<CharacterRange>* result =
new (zone) ZoneList<CharacterRange>(static_cast<int>(v->size()), zone);
for (size_t i = 0; i < v->size(); i++) {
result->Add(v->at(i), zone);
}
CharacterRange::Canonicalize(result);
return result;
}
void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
ZoneList<CharacterRange>* bmp =
ToCanonicalZoneList(splitter->bmp(), compiler->zone());
if (bmp == nullptr) return;
JSRegExp::Flags default_flags = JSRegExp::Flags();
result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
compiler->zone(), bmp, compiler->read_backward(), on_success,
default_flags)));
}
void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
ZoneList<CharacterRange>* non_bmp =
ToCanonicalZoneList(splitter->non_bmp(), compiler->zone());
if (non_bmp == nullptr) return;
DCHECK(!compiler->one_byte());
Zone* zone = compiler->zone();
JSRegExp::Flags default_flags = JSRegExp::Flags();
CharacterRange::Canonicalize(non_bmp);
for (int i = 0; i < non_bmp->length(); i++) {
// Match surrogate pair.
// E.g. [\u10005-\u11005] becomes
// \ud800[\udc05-\udfff]|
// [\ud801-\ud803][\udc00-\udfff]|
// \ud804[\udc00-\udc05]
uc32 from = non_bmp->at(i).from();
uc32 to = non_bmp->at(i).to();
uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
if (from_l == to_l) {
// The lead surrogate is the same.
result->AddAlternative(
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(from_l),
CharacterRange::Range(from_t, to_t), compiler->read_backward(),
on_success, default_flags)));
} else {
if (from_t != kTrailSurrogateStart) {
// Add [from_l][from_t-\udfff]
result->AddAlternative(
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(from_l),
CharacterRange::Range(from_t, kTrailSurrogateEnd),
compiler->read_backward(), on_success, default_flags)));
from_l++;
}
if (to_t != kTrailSurrogateEnd) {
// Add [to_l][\udc00-to_t]
result->AddAlternative(
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Singleton(to_l),
CharacterRange::Range(kTrailSurrogateStart, to_t),
compiler->read_backward(), on_success, default_flags)));
to_l--;
}
if (from_l <= to_l) {
// Add [from_l-to_l][\udc00-\udfff]
result->AddAlternative(
GuardedAlternative(TextNode::CreateForSurrogatePair(
zone, CharacterRange::Range(from_l, to_l),
CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
compiler->read_backward(), on_success, default_flags)));
}
}
}
}
RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
JSRegExp::Flags flags) {
Zone* zone = compiler->zone();
RegExpNode* match_node = TextNode::CreateForCharacterRanges(
zone, match, read_backward, on_success, flags);
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
RegExpLookaround::Builder lookaround(false, match_node, stack_register,
position_register);
RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
return lookaround.ForMatch(negative_match);
}
RegExpNode* MatchAndNegativeLookaroundInReadDirection(
RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
bool read_backward, JSRegExp::Flags flags) {
Zone* zone = compiler->zone();
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
RegExpLookaround::Builder lookaround(false, on_success, stack_register,
position_register);
RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
zone, lookahead, read_backward, lookaround.on_match_success(), flags);
return TextNode::CreateForCharacterRanges(
zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
}
void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
JSRegExp::Flags default_flags = JSRegExp::Flags();
ZoneList<CharacterRange>* lead_surrogates =
ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
if (lead_surrogates == nullptr) return;
Zone* zone = compiler->zone();
// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
RegExpNode* match;
if (compiler->read_backward()) {
// Reading backward. Assert that reading forward, there is no trail
// surrogate, and then backward match the lead surrogate.
match = NegativeLookaroundAgainstReadDirectionAndMatch(
compiler, trail_surrogates, lead_surrogates, on_success, true,
default_flags);
} else {
// Reading forward. Forward match the lead surrogate and assert that
// no trail surrogate follows.
match = MatchAndNegativeLookaroundInReadDirection(
compiler, lead_surrogates, trail_surrogates, on_success, false,
default_flags);
}
result->AddAlternative(GuardedAlternative(match));
}
void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
RegExpNode* on_success,
UnicodeRangeSplitter* splitter) {
JSRegExp::Flags default_flags = JSRegExp::Flags();
ZoneList<CharacterRange>* trail_surrogates =
ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
if (trail_surrogates == nullptr) return;
Zone* zone = compiler->zone();
// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
RegExpNode* match;
if (compiler->read_backward()) {
// Reading backward. Backward match the trail surrogate and assert that no
// lead surrogate precedes it.
match = MatchAndNegativeLookaroundInReadDirection(
compiler, trail_surrogates, lead_surrogates, on_success, true,
default_flags);
} else {
// Reading forward. Assert that reading backward, there is no lead
// surrogate, and then forward match the trail surrogate.
match = NegativeLookaroundAgainstReadDirectionAndMatch(
compiler, lead_surrogates, trail_surrogates, on_success, false,
default_flags);
}
result->AddAlternative(GuardedAlternative(match));
}
RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
RegExpNode* on_success) {
// This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
DCHECK(!compiler->read_backward());
Zone* zone = compiler->zone();
// Advance any character. If the character happens to be a lead surrogate and
// we advanced into the middle of a surrogate pair, it will work out, as
// nothing will match from there. We will have to advance again, consuming
// the associated trail surrogate.
ZoneList<CharacterRange>* range = CharacterRange::List(
zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
JSRegExp::Flags default_flags = JSRegExp::Flags();
return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
default_flags);
}
void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
#ifdef V8_INTL_SUPPORT
DCHECK(CharacterRange::IsCanonical(ranges));
// Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
// See also https://crbug.com/v8/6727.
// TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
// which we use frequently internally. But large ranges can also easily be
// created by the user. We might want to have a more general caching mechanism
// for such ranges.
if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
// Use ICU to compute the case fold closure over the ranges.
icu::UnicodeSet set;
for (int i = 0; i < ranges->length(); i++) {
set.add(ranges->at(i).from(), ranges->at(i).to());
}
ranges->Clear();
set.closeOver(USET_CASE_INSENSITIVE);
// Full case mapping map single characters to multiple characters.
// Those are represented as strings in the set. Remove them so that
// we end up with only simple and common case mappings.
set.removeAllStrings();
for (int i = 0; i < set.getRangeCount(); i++) {
ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
zone);
}
// No errors and everything we collected have been ranges.
CharacterRange::Canonicalize(ranges);
#endif // V8_INTL_SUPPORT
}
} // namespace
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
set_.Canonicalize();
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* ranges = this->ranges(zone);
if (NeedsUnicodeCaseEquivalents(flags_)) {
AddUnicodeCaseEquivalents(ranges, zone);
}
if (IsUnicode(flags_) && !compiler->one_byte() &&
!contains_split_surrogate()) {
if (is_negated()) {
ZoneList<CharacterRange>* negated =
new (zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::Negate(ranges, negated, zone);
ranges = negated;
}
if (ranges->length() == 0) {
JSRegExp::Flags default_flags;
RegExpCharacterClass* fail =
new (zone) RegExpCharacterClass(zone, ranges, default_flags);
return new (zone) TextNode(fail, compiler->read_backward(), on_success);
}
if (standard_type() == '*') {
return UnanchoredAdvance(compiler, on_success);
} else {
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
UnicodeRangeSplitter splitter(ranges);
AddBmpCharacters(compiler, result, on_success, &splitter);
AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
return result;
}
} else {
return new (zone) TextNode(this, compiler->read_backward(), on_success);
}
}
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom();
uc16 character1 = atom1->data().at(0);
uc16 character2 = atom2->data().at(0);
if (character1 < character2) return -1;
if (character1 > character2) return 1;
return 0;
}
#ifdef V8_INTL_SUPPORT
// Case Insensitve comparesion
int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom();
icu::UnicodeString character1(atom1->data().at(0));
return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
}
#else
static unibrow::uchar Canonical(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
unibrow::uchar c) {
unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
int length = canonicalize->get(c, '\0', chars);
DCHECK_LE(length, 1);
unibrow::uchar canonical = c;
if (length == 1) canonical = chars[0];
return canonical;
}
int CompareFirstCharCaseIndependent(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
RegExpTree* const* a, RegExpTree* const* b) {
RegExpAtom* atom1 = (*a)->AsAtom();
RegExpAtom* atom2 = (*b)->AsAtom();
unibrow::uchar character1 = atom1->data().at(0);
unibrow::uchar character2 = atom2->data().at(0);
if (character1 == character2) return 0;
if (character1 >= 'a' || character2 >= 'a') {
character1 = Canonical(canonicalize, character1);
character2 = Canonical(canonicalize, character2);
}
return static_cast<int>(character1) - static_cast<int>(character2);
}
#endif // V8_INTL_SUPPORT
// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
// Returns true if any consecutive atoms were found.
bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
bool found_consecutive_atoms = false;
for (int i = 0; i < length; i++) {
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (alternative->IsAtom()) break;
i++;
}
// i is length or it is the index of an atom.
if (i == length) break;
int first_atom = i;
JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
i++;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
if (alternative->AsAtom()->flags() != flags) break;
i++;
}
// Sort atoms to get ones with common prefixes together.
// This step is more tricky if we are in a case-independent regexp,
// because it would change /is|I/ to /I|is/, and order matters when
// the regexp parts don't match only disjoint starting points. To fix
// this we have a version of CompareFirstChar that uses case-
// independent character classes for comparison.
DCHECK_LT(first_atom, alternatives->length());
DCHECK_LE(i, alternatives->length());
DCHECK_LE(first_atom, i);
if (IgnoreCase(flags)) {
#ifdef V8_INTL_SUPPORT
alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
i - first_atom);
#else
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
auto compare_closure = [canonicalize](RegExpTree* const* a,
RegExpTree* const* b) {
return CompareFirstCharCaseIndependent(canonicalize, a, b);
};
alternatives->StableSort(compare_closure, first_atom, i - first_atom);
#endif // V8_INTL_SUPPORT
} else {
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
}
if (i - first_atom > 1) found_consecutive_atoms = true;
}
return found_consecutive_atoms;
}
// Optimizes ab|ac|az to a(?:b|c|d).
void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
int write_posn = 0;
int i = 0;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
RegExpAtom* const atom = alternative->AsAtom();
JSRegExp::Flags flags = atom->flags();
#ifdef V8_INTL_SUPPORT
icu::UnicodeString common_prefix(atom->data().at(0));
#else
unibrow::uchar common_prefix = atom->data().at(0);
#endif // V8_INTL_SUPPORT
int first_with_prefix = i;
int prefix_length = atom->length();
i++;
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
RegExpAtom* const atom = alternative->AsAtom();
if (atom->flags() != flags) break;
#ifdef V8_INTL_SUPPORT
icu::UnicodeString new_prefix(atom->data().at(0));
if (new_prefix != common_prefix) {
if (!IgnoreCase(flags)) break;
if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
break;
}
#else
unibrow::uchar new_prefix = atom->data().at(0);
if (new_prefix != common_prefix) {
if (!IgnoreCase(flags)) break;
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
new_prefix = Canonical(canonicalize, new_prefix);
common_prefix = Canonical(canonicalize, common_prefix);
if (new_prefix != common_prefix) break;
}
#endif // V8_INTL_SUPPORT
prefix_length = Min(prefix_length, atom->length());
i++;
}
if (i > first_with_prefix + 2) {
// Found worthwhile run of alternatives with common prefix of at least one
// character. The sorting function above did not sort on more than one
// character for reasons of correctness, but there may still be a longer
// common prefix if the terms were similar or presorted in the input.
// Find out how long the common prefix is.
int run_length = i - first_with_prefix;
RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
for (int j = 1; j < run_length && prefix_length > 1; j++) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
for (int k = 1; k < prefix_length; k++) {
if (atom->data().at(k) != old_atom->data().at(k)) {
prefix_length = k;
break;
}
}
}
RegExpAtom* prefix = new (zone)
RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
pair->Add(prefix, zone);
ZoneList<RegExpTree*>* suffixes =
new (zone) ZoneList<RegExpTree*>(run_length, zone);
for (int j = 0; j < run_length; j++) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
int len = old_atom->length();
if (len == prefix_length) {
suffixes->Add(new (zone) RegExpEmpty(), zone);
} else {
RegExpTree* suffix = new (zone) RegExpAtom(
old_atom->data().SubVector(prefix_length, old_atom->length()),
flags);
suffixes->Add(suffix, zone);
}
}
pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
} else {
// Just copy any non-worthwhile alternatives.
for (int j = first_with_prefix; j < i; j++) {
alternatives->at(write_posn++) = alternatives->at(j);
}
}
}
alternatives->Rewind(write_posn); // Trim end of array.
}
// Optimizes b|c|z to [bcz].
void RegExpDisjunction::FixSingleCharacterDisjunctions(
RegExpCompiler* compiler) {
Zone* zone = compiler->zone();
ZoneList<RegExpTree*>* alternatives = this->alternatives();
int length = alternatives->length();
int write_posn = 0;
int i = 0;
while (i < length) {
RegExpTree* alternative = alternatives->at(i);
if (!alternative->IsAtom()) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
RegExpAtom* const atom = alternative->AsAtom();
if (atom->length() != 1) {
alternatives->at(write_posn++) = alternatives->at(i);
i++;
continue;
}
JSRegExp::Flags flags = atom->flags();
DCHECK_IMPLIES(IsUnicode(flags),
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
bool contains_trail_surrogate =
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
int first_in_run = i;
i++;
// Find a run of single-character atom alternatives that have identical
// flags (case independence and unicode-ness).
while (i < length) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
RegExpAtom* const atom = alternative->AsAtom();
if (atom->length() != 1) break;
if (atom->flags() != flags) break;
DCHECK_IMPLIES(IsUnicode(flags),
!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
contains_trail_surrogate |=
unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
i++;
}
if (i > first_in_run + 1) {
// Found non-trivial run of single-character alternatives.
int run_length = i - first_in_run;
ZoneList<CharacterRange>* ranges =
new (zone) ZoneList<CharacterRange>(2, zone);
for (int j = 0; j < run_length; j++) {
RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
DCHECK_EQ(old_atom->length(), 1);
ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
}
RegExpCharacterClass::CharacterClassFlags character_class_flags;
if (IsUnicode(flags) && contains_trail_surrogate) {
character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
}
alternatives->at(write_posn++) = new (zone)
RegExpCharacterClass(zone, ranges, flags, character_class_flags);
} else {
// Just copy any trivial alternatives.
for (int j = first_in_run; j < i; j++) {
alternatives->at(write_posn++) = alternatives->at(j);
}
}
}
alternatives->Rewind(write_posn); // Trim end of array.
}
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
if (alternatives->length() > 2) {
bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
FixSingleCharacterDisjunctions(compiler);
if (alternatives->length() == 1) {
return alternatives->at(0)->ToNode(compiler, on_success);
}
}
int length = alternatives->length();
ChoiceNode* result =
new (compiler->zone()) ChoiceNode(length, compiler->zone());
for (int i = 0; i < length; i++) {
GuardedAlternative alternative(
alternatives->at(i)->ToNode(compiler, on_success));
result->AddAlternative(alternative);
}
return result;
}
RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
}
namespace {
// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
RegExpNode* on_success,
RegExpAssertion::AssertionType type,
JSRegExp::Flags flags) {
DCHECK(NeedsUnicodeCaseEquivalents(flags));
Zone* zone = compiler->zone();
ZoneList<CharacterRange>* word_range =
new (zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::AddClassEscape('w', word_range, true, zone);
int stack_register = compiler->UnicodeLookaroundStackRegister();
int position_register = compiler->UnicodeLookaroundPositionRegister();
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
// Add two choices. The (non-)boundary could start with a word or
// a non-word-character.
for (int i = 0; i < 2; i++) {
bool lookbehind_for_word = i == 0;
bool lookahead_for_word =
(type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
// Look to the left.
RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
stack_register, position_register);
RegExpNode* backward = TextNode::CreateForCharacterRanges(
zone, word_range, true, lookbehind.on_match_success(), flags);
// Look to the right.
RegExpLookaround::Builder lookahead(lookahead_for_word,
lookbehind.ForMatch(backward),
stack_register, position_register);
RegExpNode* forward = TextNode::CreateForCharacterRanges(
zone, word_range, false, lookahead.on_match_success(), flags);
result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
}
return result;
}
} // anonymous namespace
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
NodeInfo info;
Zone* zone = compiler->zone();
switch (assertion_type()) {
case START_OF_LINE:
return AssertionNode::AfterNewline(on_success);
case START_OF_INPUT:
return AssertionNode::AtStart(on_success);
case BOUNDARY:
return NeedsUnicodeCaseEquivalents(flags_)
? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
flags_)
: AssertionNode::AtBoundary(on_success);
case NON_BOUNDARY:
return NeedsUnicodeCaseEquivalents(flags_)
? BoundaryAssertionAsLookaround(compiler, on_success,
NON_BOUNDARY, flags_)
: AssertionNode::AtNonBoundary(on_success);
case END_OF_INPUT:
return AssertionNode::AtEnd(on_success);
case END_OF_LINE: {
// Compile $ in multiline regexps as an alternation with a positive
// lookahead in one side and an end-of-input on the other side.
// We need two registers for the lookahead.
int stack_pointer_register = compiler->AllocateRegister();
int position_register = compiler->AllocateRegister();
// The ChoiceNode to distinguish between a newline and end-of-input.
ChoiceNode* result = new (zone) ChoiceNode(2, zone);
// Create a newline atom.
ZoneList<CharacterRange>* newline_ranges =
new (zone) ZoneList<CharacterRange>(3, zone);
CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
JSRegExp::Flags default_flags = JSRegExp::Flags();
RegExpCharacterClass* newline_atom =
new (zone) RegExpCharacterClass('n', default_flags);
TextNode* newline_matcher =
new (zone) TextNode(newline_atom, false,
ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register,
0, // No captures inside.
-1, // Ignored if no captures.
on_success));
// Create an end-of-input matcher.
RegExpNode* end_of_line = ActionNode::BeginSubmatch(
stack_pointer_register, position_register, newline_matcher);
// Add the two alternatives to the ChoiceNode.
GuardedAlternative eol_alternative(end_of_line);
result->AddAlternative(eol_alternative);
GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
result->AddAlternative(end_alternative);
return result;
}
default:
UNREACHABLE();
}
return on_success;
}
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return new (compiler->zone())
BackReferenceNode(RegExpCapture::StartRegister(index()),
RegExpCapture::EndRegister(index()), flags_,
compiler->read_backward(), on_success);
}
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return on_success;
}
RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
int stack_pointer_register,
int position_register,
int capture_register_count,
int capture_register_start)
: is_positive_(is_positive),
on_success_(on_success),
stack_pointer_register_(stack_pointer_register),
position_register_(position_register) {
if (is_positive_) {
on_match_success_ = ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register, capture_register_count,
capture_register_start, on_success_);
} else {
Zone* zone = on_success_->zone();
on_match_success_ = new (zone) NegativeSubmatchSuccess(
stack_pointer_register, position_register, capture_register_count,
capture_register_start, zone);
}
}
RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
if (is_positive_) {
return ActionNode::BeginSubmatch(stack_pointer_register_,
position_register_, match);
} else {
Zone* zone = on_success_->zone();
// We use a ChoiceNode to represent the negative lookaround. The first
// alternative is the negative match. On success, the end node backtracks.
// On failure, the second alternative is tried and leads to success.
// NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
// first exit when calculating quick checks.
ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
GuardedAlternative(match), GuardedAlternative(on_success_), zone);
return ActionNode::BeginSubmatch(stack_pointer_register_,
position_register_, choice_node);
}
}
RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
int stack_pointer_register = compiler->AllocateRegister();
int position_register = compiler->AllocateRegister();
const int registers_per_capture = 2;
const int register_of_first_capture = 2;
int register_count = capture_count_ * registers_per_capture;
int register_start =
register_of_first_capture + capture_from_ * registers_per_capture;
RegExpNode* result;
bool was_reading_backward = compiler->read_backward();
compiler->set_read_backward(type() == LOOKBEHIND);
Builder builder(is_positive(), on_success, stack_pointer_register,
position_register, register_count, register_start);
RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
result = builder.ForMatch(match);
compiler->set_read_backward(was_reading_backward);
return result;
}
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return ToNode(body(), index(), compiler, on_success);
}
RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
RegExpCompiler* compiler,
RegExpNode* on_success) {
DCHECK_NOT_NULL(body);
int start_reg = RegExpCapture::StartRegister(index);
int end_reg = RegExpCapture::EndRegister(index);
if (compiler->read_backward()) std::swap(start_reg, end_reg);
RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
RegExpNode* body_node = body->ToNode(compiler, store_end);
return ActionNode::StorePosition(start_reg, true, body_node);
}
namespace {
class AssertionSequenceRewriter final {
public:
// TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
// instead of sprinkling rewrites into the AST->Node conversion process.
static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
AssertionSequenceRewriter rewriter(terms, zone);
static constexpr int kNoIndex = -1;
int from = kNoIndex;
for (int i = 0; i < terms->length(); i++) {
RegExpTree* t = terms->at(i);
if (from == kNoIndex && t->IsAssertion()) {
from = i; // Start a sequence.
} else if (from != kNoIndex && !t->IsAssertion()) {
// Terminate and process the sequence.
if (i - from > 1) rewriter.Rewrite(from, i);
from = kNoIndex;
}
}
if (from != kNoIndex && terms->length() - from > 1) {
rewriter.Rewrite(from, terms->length());
}
}
// All assertions are zero width. A consecutive sequence of assertions is
// order-independent. There's two ways we can optimize here:
// 1. fold all identical assertions.
// 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
// sequence fails.
void Rewrite(int from, int to) {
DCHECK_GT(to, from + 1);
// Bitfield of all seen assertions.
uint32_t seen_assertions = 0;
STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte);
// Flags must match for folding.
JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags();
bool saw_mismatched_flags = false;
for (int i = from; i < to; i++) {
RegExpAssertion* t = terms_->at(i)->AsAssertion();
if (t->flags() != flags) saw_mismatched_flags = true;
const uint32_t bit = 1 << t->assertion_type();
if ((seen_assertions & bit) && !saw_mismatched_flags) {
// Fold duplicates.
terms_->Set(i, new (zone_) RegExpEmpty());
}
seen_assertions |= bit;
}
// Collapse failures.
const uint32_t always_fails_mask =
1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY;
if ((seen_assertions & always_fails_mask) == always_fails_mask) {
ReplaceSequenceWithFailure(from, to);
}
}
void ReplaceSequenceWithFailure(int from, int to) {
// Replace the entire sequence with a single node that always fails.
// TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
// negated '*' (everything) range serves the purpose.
ZoneList<CharacterRange>* ranges =
new (zone_) ZoneList<CharacterRange>(0, zone_);
RegExpCharacterClass* cc =
new (zone_) RegExpCharacterClass(zone_, ranges, JSRegExp::Flags());
terms_->Set(from, cc);
// Zero out the rest.
RegExpEmpty* empty = new (zone_) RegExpEmpty();
for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
}
private:
AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
: zone_(zone), terms_(terms) {}
Zone* zone_;
ZoneList<RegExpTree*>* terms_;
};
} // namespace
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* children = nodes();
AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
RegExpNode* current = on_success;
if (compiler->read_backward()) {
for (int i = 0; i < children->length(); i++) {
current = children->at(i)->ToNode(compiler, current);
}
} else {
for (int i = children->length() - 1; i >= 0; i--) {
current = children->at(i)->ToNode(compiler, current);
}
}
return current;
}
static void AddClass(const int* elmv, int elmc,
ZoneList<CharacterRange>* ranges, Zone* zone) {
elmc--;
DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
for (int i = 0; i < elmc; i += 2) {
DCHECK(elmv[i] < elmv[i + 1]);
ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
}
}
static void AddClassNegated(const int* elmv, int elmc,
ZoneList<CharacterRange>* ranges, Zone* zone) {
elmc--;
DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
DCHECK_NE(0x0000, elmv[0]);
DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
uc16 last = 0x0000;
for (int i = 0; i < elmc; i += 2) {
DCHECK(last <= elmv[i] - 1);
DCHECK(elmv[i] < elmv[i + 1]);
ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
last = elmv[i + 1];
}
ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
}
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents,
Zone* zone) {
if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
// See #sec-runtime-semantics-wordcharacters-abstract-operation
// In case of unicode and ignore_case, we need to create the closure over
// case equivalent characters before negating.
ZoneList<CharacterRange>* new_ranges =
new (zone) ZoneList<CharacterRange>(2, zone);
AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
AddUnicodeCaseEquivalents(new_ranges, zone);
if (type == 'W') {
ZoneList<CharacterRange>* negated =
new (zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::Negate(new_ranges, negated, zone);
new_ranges = negated;
}
ranges->AddAll(*new_ranges, zone);
return;
}
AddClassEscape(type, ranges, zone);
}
void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
Zone* zone) {
switch (type) {
case 's':
AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
break;
case 'S':
AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
break;
case 'w':
AddClass(kWordRanges, kWordRangeCount, ranges, zone);
break;
case 'W':
AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
break;
case 'd':
AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
break;
case 'D':
AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
break;
case '.':
AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
zone);
break;
// This is not a character range as defined by the spec but a
// convenient shorthand for a character class that matches any
// character.
case '*':
ranges->Add(CharacterRange::Everything(), zone);
break;
// This is the set of characters matched by the $ and ^ symbols
// in multiline mode.
case 'n':
AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
break;
default:
UNREACHABLE();
}
}
Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
bool is_one_byte) {
CharacterRange::Canonicalize(ranges);
int range_count = ranges->length();
#ifdef V8_INTL_SUPPORT
icu::UnicodeSet others;
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 from = range.from();
if (from > String::kMaxUtf16CodeUnit) continue;
uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit);
// Nothing to be done for surrogates.
if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
if (from > String::kMaxOneByteCharCode) continue;
if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;
}
others.add(from, to);
}
// Compute the set of additional characters that should be added,
// using UnicodeSet::closeOver. ECMA 262 defines slightly different
// case-folding rules than Unicode, so some characters that are
// added by closeOver do not match anything other than themselves in
// JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
// same case-insensitive character as 's' or 'S' according to
// Unicode, but does not match any other character in JS. To handle
// this case, we add such characters to the IgnoreSet and filter
// them out. We filter twice: once before calling closeOver (to
// prevent 'ſ' from adding 's'), and once after calling closeOver
// (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
// more information.
icu::UnicodeSet already_added(others);
others.removeAll(RegExpCaseFolding::IgnoreSet());
others.closeOver(USET_CASE_INSENSITIVE);
others.removeAll(RegExpCaseFolding::IgnoreSet());
others.removeAll(already_added);
// Add others to the ranges
for (int32_t i = 0; i < others.getRangeCount(); i++) {
UChar32 from = others.getRangeStart(i);
UChar32 to = others.getRangeEnd(i);
if (from == to) {
ranges->Add(CharacterRange::Singleton(from), zone);
} else {
ranges->Add(CharacterRange::Range(from, to), zone);
}
}
#else
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 bottom = range.from();
if (bottom > String::kMaxUtf16CodeUnit) continue;
uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
// Nothing to be done for surrogates.
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
if (bottom > String::kMaxOneByteCharCode) continue;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (top == bottom) {
// If this is a singleton we just expand the one character.
int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
for (int i = 0; i < length; i++) {
uc32 chr = chars[i];
if (chr != bottom) {
ranges->Add(CharacterRange::Singleton(chars[i]), zone);
}
}
} else {
// If this is a range we expand the characters block by block, expanding
// contiguous subranges (blocks) one at a time. The approach is as
// follows. For a given start character we look up the remainder of the
// block that contains it (represented by the end point), for instance we
// find 'z' if the character is 'c'. A block is characterized by the
// property that all characters uncanonicalize in the same way, except
// that each entry in the result is incremented by the distance from the
// first element. So a-z is a block because 'a' uncanonicalizes to ['a',
// 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
// we've found the end point we look up its uncanonicalization and
// produce a range for each element. For instance for [c-f] we look up
// ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
// it is not already contained in the input, so [c-f] will be skipped but
// [C-F] will be added. If this range is not completely contained in a
// block we do this for all the blocks covered by the range (handling
// characters that is not in a block as a "singleton block").
unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int pos = bottom;
while (pos <= top) {
int length =
isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
uc32 block_end;
if (length == 0) {
block_end = pos;
} else {
DCHECK_EQ(1, length);
block_end = equivalents[0];
}
int end = (block_end > top) ? top : block_end;
length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
equivalents);
for (int i = 0; i < length; i++) {
uc32 c = equivalents[i];
uc32 range_from = c - (block_end - pos);
uc32 range_to = c - (block_end - end);
if (!(bottom <= range_from && range_to <= top)) {
ranges->Add(CharacterRange::Range(range_from, range_to), zone);
}
}
pos = end + 1;
}
}
}
#endif // V8_INTL_SUPPORT
}
bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
DCHECK_NOT_NULL(ranges);
int n = ranges->length();
if (n <= 1) return true;
int max = ranges->at(0).to();
for (int i = 1; i < n; i++) {
CharacterRange next_range = ranges->at(i);
if (next_range.from() <= max + 1) return false;
max = next_range.to();
}
return true;
}
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
if (ranges_ == nullptr) {
ranges_ = new (zone) ZoneList<CharacterRange>(2, zone);
CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
}
return ranges_;
}
// Move a number of elements in a zonelist to another position
// in the same list. Handles overlapping source and target areas.
static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to,
int count) {
// Ranges are potentially overlapping.
if (from < to) {
for (int i = count - 1; i >= 0; i--) {
list->at(to + i) = list->at(from + i);
}
} else {
for (int i = 0; i < count; i++) {
list->at(to + i) = list->at(from + i);
}
}
}
static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
CharacterRange insert) {
// Inserts a range into list[0..count[, which must be sorted
// by from value and non-overlapping and non-adjacent, using at most
// list[0..count] for the result. Returns the number of resulting
// canonicalized ranges. Inserting a range may collapse existing ranges into
// fewer ranges, so the return value can be anything in the range 1..count+1.
uc32 from = insert.from();
uc32 to = insert.to();
int start_pos = 0;
int end_pos = count;
for (int i = count - 1; i >= 0; i--) {
CharacterRange current = list->at(i);
if (current.from() > to + 1) {
end_pos = i;
} else if (current.to() + 1 < from) {
start_pos = i + 1;
break;
}
}
// Inserted range overlaps, or is adjacent to, ranges at positions
// [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
// not affected by the insertion.
// If start_pos == end_pos, the range must be inserted before start_pos.
// if start_pos < end_pos, the entire range from start_pos to end_pos
// must be merged with the insert range.
if (start_pos == end_pos) {
// Insert between existing ranges at position start_pos.
if (start_pos < count) {
MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
}
list->at(start_pos) = insert;
return count + 1;
}
if (start_pos + 1 == end_pos) {
// Replace single existing range at position start_pos.
CharacterRange to_replace = list->at(start_pos);
int new_from = Min(to_replace.from(), from);
int new_to = Max(to_replace.to(), to);
list->at(start_pos) = CharacterRange::Range(new_from, new_to);
return count;
}
// Replace a number of existing ranges from start_pos to end_pos - 1.
// Move the remaining ranges down.
int new_from = Min(list->at(start_pos).from(), from);
int new_to = Max(list->at(end_pos - 1).to(), to);
if (end_pos < count) {
MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
}
list->at(start_pos) = CharacterRange::Range(new_from, new_to);
return count - (end_pos - start_pos) + 1;
}
void CharacterSet::Canonicalize() {
// Special/default classes are always considered canonical. The result
// of calling ranges() will be sorted.
if (ranges_ == nullptr) return;
CharacterRange::Canonicalize(ranges_);
}
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
if (character_ranges->length() <= 1) return;
// Check whether ranges are already canonical (increasing, non-overlapping,
// non-adjacent).
int n = character_ranges->length();
int max = character_ranges->at(0).to();
int i = 1;
while (i < n) {
CharacterRange current = character_ranges->at(i);
if (current.from() <= max + 1) {
break;
}
max = current.to();
i++;
}
// Canonical until the i'th range. If that's all of them, we are done.
if (i == n) return;
// The ranges at index i and forward are not canonicalized. Make them so by
// doing the equivalent of insertion sort (inserting each into the previous
// list, in order).
// Notice that inserting a range can reduce the number of ranges in the
// result due to combining of adjacent and overlapping ranges.
int read = i; // Range to insert.
int num_canonical = i; // Length of canonicalized part of list.
do {
num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
character_ranges->at(read));
read++;
} while (read < n);
character_ranges->Rewind(num_canonical);
DCHECK(CharacterRange::IsCanonical(character_ranges));
}
void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
ZoneList<CharacterRange>* negated_ranges,
Zone* zone) {
DCHECK(CharacterRange::IsCanonical(ranges));
DCHECK_EQ(0, negated_ranges->length());
int range_count = ranges->length();
uc32 from = 0;
int i = 0;
if (range_count > 0 && ranges->at(0).from() == 0) {
from = ranges->at(0).to() + 1;
i = 1;
}
while (i < range_count) {
CharacterRange range = ranges->at(i);
negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
from = range.to() + 1;
i++;
}
if (from < String::kMaxCodePoint) {
negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
zone);
}
}
// Scoped object to keep track of how much we unroll quantifier loops in the
// regexp graph generator.
class RegExpExpansionLimiter {
public:
static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
: compiler_(compiler),
saved_expansion_factor_(compiler->current_expansion_factor()),
ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
DCHECK_LT(0, factor);
if (ok_to_expand_) {
if (factor > kMaxExpansionFactor) {
// Avoid integer overflow of the current expansion factor.
ok_to_expand_ = false;
compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
} else {
int new_factor = saved_expansion_factor_ * factor;
ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
compiler->set_current_expansion_factor(new_factor);
}
}
}
~RegExpExpansionLimiter() {
compiler_->set_current_expansion_factor(saved_expansion_factor_);
}
bool ok_to_expand() { return ok_to_expand_; }
private:
RegExpCompiler* compiler_;
int saved_expansion_factor_;
bool ok_to_expand_;
DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
};
RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
RegExpTree* body, RegExpCompiler* compiler,
RegExpNode* on_success,
bool not_at_start) {
// x{f, t} becomes this:
//
// (r++)<-.
// | `
// | (x)
// v ^
// (r=0)-->(?)---/ [if r < t]
// |
// [if r >= f] \----> ...
//
// 15.10.2.5 RepeatMatcher algorithm.
// The parser has already eliminated the case where max is 0. In the case
// where max_match is zero the parser has removed the quantifier if min was
// > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
// If we know that we cannot match zero length then things are a little
// simpler since we don't need to make the special zero length match check
// from step 2.1. If the min and max are small we can unroll a little in
// this case.
static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
if (max == 0) return on_success; // This can happen due to recursion.
bool body_can_be_empty = (body->min_match() == 0);
int body_start_reg = RegExpCompiler::kNoRegister;
Interval capture_registers = body->CaptureRegisters();
bool needs_capture_clearing = !capture_registers.is_empty();
Zone* zone = compiler->zone();
if (body_can_be_empty) {
body_start_reg = compiler->AllocateRegister();
} else if (compiler->optimize() && !needs_capture_clearing) {
// Only unroll if there are no captures and the body can't be
// empty.
{
RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
int new_max = (max == kInfinity) ? max : max - min;
// Recurse once to get the loop or optional matches after the fixed
// ones.
RegExpNode* answer =
ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
// Unroll the forced matches from 0 to min. This can cause chains of
// TextNodes (which the parser does not generate). These should be
// combined if it turns out they hinder good code generation.
for (int i = 0; i < min; i++) {
answer = body->ToNode(compiler, answer);
}
return answer;
}
}
if (max <= kMaxUnrolledMaxMatches && min == 0) {
DCHECK_LT(0, max); // Due to the 'if' above.
RegExpExpansionLimiter limiter(compiler, max);
if (limiter.ok_to_expand()) {
// Unroll the optional matches up to max.
RegExpNode* answer = on_success;
for (int i = 0; i < max; i++) {
ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
if (is_greedy) {
alternation->AddAlternative(
GuardedAlternative(body->ToNode(compiler, answer)));
alternation->AddAlternative(GuardedAlternative(on_success));
} else {
alternation->AddAlternative(GuardedAlternative(on_success));
alternation->AddAlternative(
GuardedAlternative(body->ToNode(compiler, answer)));
}
answer = alternation;
if (not_at_start && !compiler->read_backward()) {
alternation->set_not_at_start();
}
}
return answer;
}
}
}
bool has_min = min > 0;
bool has_max = max < RegExpTree::kInfinity;
bool needs_counter = has_min || has_max;
int reg_ctr = needs_counter ? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center = new (zone) LoopChoiceNode(
body->min_match() == 0, compiler->read_backward(), min, zone);
if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
RegExpNode* loop_return =
needs_counter ? static_cast<RegExpNode*>(
ActionNode::IncrementRegister(reg_ctr, center))
: static_cast<RegExpNode*>(center);
if (body_can_be_empty) {
// If the body can be empty we need to check if it was and then
// backtrack.
loop_return =
ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
}
RegExpNode* body_node = body->ToNode(compiler, loop_return);
if (body_can_be_empty) {
// If the body can be empty we need to store the start position
// so we can bail out if it was empty.
body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
}
if (needs_capture_clearing) {
// Before entering the body of this loop we need to clear captures.
body_node = ActionNode::ClearCaptures(capture_registers, body_node);
}
GuardedAlternative body_alt(body_node);
if (has_max) {
Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
body_alt.AddGuard(body_guard, zone);
}
GuardedAlternative rest_alt(on_success);
if (has_min) {
Guard* rest_guard = new (compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
rest_alt.AddGuard(rest_guard, zone);
}
if (is_greedy) {
center->AddLoopAlternative(body_alt);
center->AddContinueAlternative(rest_alt);
} else {
center->AddContinueAlternative(rest_alt);
center->AddLoopAlternative(body_alt);
}
if (needs_counter) {
return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
} else {
return center;
}
}
} // namespace internal
} // namespace v8