362 lines
12 KiB
C++
362 lines
12 KiB
C++
// Copyright 2016 the V8 project authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
#ifndef V8_REGEXP_REGEXP_PARSER_H_
|
|
#define V8_REGEXP_REGEXP_PARSER_H_
|
|
|
|
#include "regexp/regexp-ast.h"
|
|
#include "regexp/regexp-error.h"
|
|
|
|
namespace v8 {
|
|
namespace internal {
|
|
|
|
struct RegExpCompileData;
|
|
|
|
// A BufferedZoneList is an automatically growing list, just like (and backed
|
|
// by) a ZoneList, that is optimized for the case of adding and removing
|
|
// a single element. The last element added is stored outside the backing list,
|
|
// and if no more than one element is ever added, the ZoneList isn't even
|
|
// allocated.
|
|
// Elements must not be nullptr pointers.
|
|
template <typename T, int initial_size>
|
|
class BufferedZoneList {
|
|
public:
|
|
BufferedZoneList() : list_(nullptr), last_(nullptr) {}
|
|
|
|
// Adds element at end of list. This element is buffered and can
|
|
// be read using last() or removed using RemoveLast until a new Add or until
|
|
// RemoveLast or GetList has been called.
|
|
void Add(T* value, Zone* zone) {
|
|
if (last_ != nullptr) {
|
|
if (list_ == nullptr) {
|
|
list_ = new (zone) ZoneList<T*>(initial_size, zone);
|
|
}
|
|
list_->Add(last_, zone);
|
|
}
|
|
last_ = value;
|
|
}
|
|
|
|
T* last() {
|
|
DCHECK(last_ != nullptr);
|
|
return last_;
|
|
}
|
|
|
|
T* RemoveLast() {
|
|
DCHECK(last_ != nullptr);
|
|
T* result = last_;
|
|
if ((list_ != nullptr) && (list_->length() > 0))
|
|
last_ = list_->RemoveLast();
|
|
else
|
|
last_ = nullptr;
|
|
return result;
|
|
}
|
|
|
|
T* Get(int i) {
|
|
DCHECK((0 <= i) && (i < length()));
|
|
if (list_ == nullptr) {
|
|
DCHECK_EQ(0, i);
|
|
return last_;
|
|
} else {
|
|
if (i == list_->length()) {
|
|
DCHECK(last_ != nullptr);
|
|
return last_;
|
|
} else {
|
|
return list_->at(i);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Clear() {
|
|
list_ = nullptr;
|
|
last_ = nullptr;
|
|
}
|
|
|
|
int length() {
|
|
int length = (list_ == nullptr) ? 0 : list_->length();
|
|
return length + ((last_ == nullptr) ? 0 : 1);
|
|
}
|
|
|
|
ZoneList<T*>* GetList(Zone* zone) {
|
|
if (list_ == nullptr) {
|
|
list_ = new (zone) ZoneList<T*>(initial_size, zone);
|
|
}
|
|
if (last_ != nullptr) {
|
|
list_->Add(last_, zone);
|
|
last_ = nullptr;
|
|
}
|
|
return list_;
|
|
}
|
|
|
|
private:
|
|
ZoneList<T*>* list_;
|
|
T* last_;
|
|
};
|
|
|
|
|
|
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
|
class RegExpBuilder : public ZoneObject {
|
|
public:
|
|
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
|
|
void AddCharacter(uc16 character);
|
|
void AddUnicodeCharacter(uc32 character);
|
|
void AddEscapedUnicodeCharacter(uc32 character);
|
|
// "Adds" an empty expression. Does nothing except consume a
|
|
// following quantifier
|
|
void AddEmpty();
|
|
void AddCharacterClass(RegExpCharacterClass* cc);
|
|
void AddCharacterClassForDesugaring(uc32 c);
|
|
void AddAtom(RegExpTree* tree);
|
|
void AddTerm(RegExpTree* tree);
|
|
void AddAssertion(RegExpTree* tree);
|
|
void NewAlternative(); // '|'
|
|
bool AddQuantifierToAtom(int min, int max,
|
|
RegExpQuantifier::QuantifierType type);
|
|
void FlushText();
|
|
RegExpTree* ToRegExp();
|
|
JSRegExp::Flags flags() const { return flags_; }
|
|
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
|
|
|
|
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
|
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
|
|
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
|
|
|
|
private:
|
|
static const uc16 kNoPendingSurrogate = 0;
|
|
void AddLeadSurrogate(uc16 lead_surrogate);
|
|
void AddTrailSurrogate(uc16 trail_surrogate);
|
|
void FlushPendingSurrogate();
|
|
void FlushCharacters();
|
|
void FlushTerms();
|
|
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
|
|
bool NeedsDesugaringForIgnoreCase(uc32 c);
|
|
Zone* zone() const { return zone_; }
|
|
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
|
|
|
|
Zone* zone_;
|
|
bool pending_empty_;
|
|
JSRegExp::Flags flags_;
|
|
ZoneList<uc16>* characters_;
|
|
uc16 pending_surrogate_;
|
|
BufferedZoneList<RegExpTree, 2> terms_;
|
|
BufferedZoneList<RegExpTree, 2> text_;
|
|
BufferedZoneList<RegExpTree, 2> alternatives_;
|
|
#ifdef DEBUG
|
|
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
|
|
#define LAST(x) last_added_ = x;
|
|
#else
|
|
#define LAST(x)
|
|
#endif
|
|
};
|
|
|
|
class V8_EXPORT_PRIVATE RegExpParser {
|
|
public:
|
|
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
|
|
Zone* zone);
|
|
|
|
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
|
|
JSRegExp::Flags flags, RegExpCompileData* result);
|
|
|
|
RegExpTree* ParsePattern();
|
|
RegExpTree* ParseDisjunction();
|
|
RegExpTree* ParseGroup();
|
|
|
|
// Parses a {...,...} quantifier and stores the range in the given
|
|
// out parameters.
|
|
bool ParseIntervalQuantifier(int* min_out, int* max_out);
|
|
|
|
// Parses and returns a single escaped character. The character
|
|
// must not be 'b' or 'B' since they are usually handle specially.
|
|
uc32 ParseClassCharacterEscape();
|
|
|
|
// Checks whether the following is a length-digit hexadecimal number,
|
|
// and sets the value if it is.
|
|
bool ParseHexEscape(int length, uc32* value);
|
|
bool ParseUnicodeEscape(uc32* value);
|
|
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
|
|
|
|
bool ParsePropertyClassName(ZoneVector<char>* name_1,
|
|
ZoneVector<char>* name_2);
|
|
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
|
|
const ZoneVector<char>& name_1,
|
|
const ZoneVector<char>& name_2);
|
|
|
|
RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
|
|
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
|
|
|
|
uc32 ParseOctalLiteral();
|
|
|
|
// Tries to parse the input as a back reference. If successful it
|
|
// stores the result in the output parameter and returns true. If
|
|
// it fails it will push back the characters read so the same characters
|
|
// can be reparsed.
|
|
bool ParseBackReferenceIndex(int* index_out);
|
|
|
|
// Parse inside a class. Either add escaped class to the range, or return
|
|
// false and pass parsed single character through |char_out|.
|
|
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
|
|
bool add_unicode_case_equivalents, uc32* char_out,
|
|
bool* is_class_escape);
|
|
|
|
char ParseClassEscape();
|
|
|
|
RegExpTree* ReportError(RegExpError error);
|
|
void Advance();
|
|
void Advance(int dist);
|
|
void Reset(int pos);
|
|
|
|
// Reports whether the pattern might be used as a literal search string.
|
|
// Only use if the result of the parse is a single atom node.
|
|
bool simple();
|
|
bool contains_anchor() { return contains_anchor_; }
|
|
void set_contains_anchor() { contains_anchor_ = true; }
|
|
int captures_started() { return captures_started_; }
|
|
int position() { return next_pos_ - 1; }
|
|
bool failed() { return failed_; }
|
|
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
|
|
// just read the initial flag value here.
|
|
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
|
|
|
|
static bool IsSyntaxCharacterOrSlash(uc32 c);
|
|
|
|
static const uc32 kEndMarker = (1 << 21);
|
|
|
|
private:
|
|
enum SubexpressionType {
|
|
INITIAL,
|
|
CAPTURE, // All positive values represent captures.
|
|
POSITIVE_LOOKAROUND,
|
|
NEGATIVE_LOOKAROUND,
|
|
GROUPING
|
|
};
|
|
|
|
class RegExpParserState : public ZoneObject {
|
|
public:
|
|
// Push a state on the stack.
|
|
RegExpParserState(RegExpParserState* previous_state,
|
|
SubexpressionType group_type,
|
|
RegExpLookaround::Type lookaround_type,
|
|
int disjunction_capture_index,
|
|
const ZoneVector<uc16>* capture_name,
|
|
JSRegExp::Flags flags, Zone* zone)
|
|
: previous_state_(previous_state),
|
|
builder_(new (zone) RegExpBuilder(zone, flags)),
|
|
group_type_(group_type),
|
|
lookaround_type_(lookaround_type),
|
|
disjunction_capture_index_(disjunction_capture_index),
|
|
capture_name_(capture_name) {}
|
|
// Parser state of containing expression, if any.
|
|
RegExpParserState* previous_state() const { return previous_state_; }
|
|
bool IsSubexpression() { return previous_state_ != nullptr; }
|
|
// RegExpBuilder building this regexp's AST.
|
|
RegExpBuilder* builder() const { return builder_; }
|
|
// Type of regexp being parsed (parenthesized group or entire regexp).
|
|
SubexpressionType group_type() const { return group_type_; }
|
|
// Lookahead or Lookbehind.
|
|
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
|
|
// Index in captures array of first capture in this sub-expression, if any.
|
|
// Also the capture index of this sub-expression itself, if group_type
|
|
// is CAPTURE.
|
|
int capture_index() const { return disjunction_capture_index_; }
|
|
// The name of the current sub-expression, if group_type is CAPTURE. Only
|
|
// used for named captures.
|
|
const ZoneVector<uc16>* capture_name() const { return capture_name_; }
|
|
|
|
bool IsNamedCapture() const { return capture_name_ != nullptr; }
|
|
|
|
// Check whether the parser is inside a capture group with the given index.
|
|
bool IsInsideCaptureGroup(int index);
|
|
// Check whether the parser is inside a capture group with the given name.
|
|
bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
|
|
|
|
private:
|
|
// Linked list implementation of stack of states.
|
|
RegExpParserState* const previous_state_;
|
|
// Builder for the stored disjunction.
|
|
RegExpBuilder* const builder_;
|
|
// Stored disjunction type (capture, look-ahead or grouping), if any.
|
|
const SubexpressionType group_type_;
|
|
// Stored read direction.
|
|
const RegExpLookaround::Type lookaround_type_;
|
|
// Stored disjunction's capture index (if any).
|
|
const int disjunction_capture_index_;
|
|
// Stored capture name (if any).
|
|
const ZoneVector<uc16>* const capture_name_;
|
|
};
|
|
|
|
// Return the 1-indexed RegExpCapture object, allocate if necessary.
|
|
RegExpCapture* GetCapture(int index);
|
|
|
|
// Creates a new named capture at the specified index. Must be called exactly
|
|
// once for each named capture. Fails if a capture with the same name is
|
|
// encountered.
|
|
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
|
|
|
|
// Parses the name of a capture group (?<name>pattern). The name must adhere
|
|
// to IdentifierName in the ECMAScript standard.
|
|
const ZoneVector<uc16>* ParseCaptureGroupName();
|
|
|
|
bool ParseNamedBackReference(RegExpBuilder* builder,
|
|
RegExpParserState* state);
|
|
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
|
|
|
|
// After the initial parsing pass, patch corresponding RegExpCapture objects
|
|
// into all RegExpBackReferences. This is done after initial parsing in order
|
|
// to avoid complicating cases in which references comes before the capture.
|
|
void PatchNamedBackReferences();
|
|
|
|
Handle<FixedArray> CreateCaptureNameMap();
|
|
|
|
// Returns true iff the pattern contains named captures. May call
|
|
// ScanForCaptures to look ahead at the remaining pattern.
|
|
bool HasNamedCaptures();
|
|
|
|
Isolate* isolate() { return isolate_; }
|
|
Zone* zone() const { return zone_; }
|
|
|
|
uc32 current() { return current_; }
|
|
bool has_more() { return has_more_; }
|
|
bool has_next() { return next_pos_ < in()->length(); }
|
|
uc32 Next();
|
|
template <bool update_position>
|
|
uc32 ReadNext();
|
|
FlatStringReader* in() { return in_; }
|
|
void ScanForCaptures();
|
|
|
|
struct RegExpCaptureNameLess {
|
|
bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
|
|
DCHECK_NOT_NULL(lhs);
|
|
DCHECK_NOT_NULL(rhs);
|
|
return *lhs->name() < *rhs->name();
|
|
}
|
|
};
|
|
|
|
Isolate* isolate_;
|
|
Zone* zone_;
|
|
RegExpError error_ = RegExpError::kNone;
|
|
int error_pos_ = 0;
|
|
ZoneList<RegExpCapture*>* captures_;
|
|
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
|
|
ZoneList<RegExpBackReference*>* named_back_references_;
|
|
FlatStringReader* in_;
|
|
uc32 current_;
|
|
// These are the flags specified outside the regexp syntax ie after the
|
|
// terminating '/' or in the second argument to the constructor. The current
|
|
// flags are stored on the RegExpBuilder.
|
|
JSRegExp::Flags top_level_flags_;
|
|
int next_pos_;
|
|
int captures_started_;
|
|
int capture_count_; // Only valid after we have scanned for captures.
|
|
bool has_more_;
|
|
bool simple_;
|
|
bool contains_anchor_;
|
|
bool is_scanned_for_captures_;
|
|
bool has_named_captures_; // Only valid after we have scanned for captures.
|
|
bool failed_;
|
|
};
|
|
|
|
} // namespace internal
|
|
} // namespace v8
|
|
|
|
#endif // V8_REGEXP_REGEXP_PARSER_H_
|