325 lines
11 KiB
C++
325 lines
11 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
|
|
* vim: set ts=8 sts=4 et sw=4 tw=99: */
|
|
|
|
// Copyright 2012 the V8 project authors. All rights reserved.
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Google Inc. nor the names of its
|
|
// contributors may be used to endorse or promote products derived
|
|
// from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#ifndef V8_PARSER_H_
|
|
#define V8_PARSER_H_
|
|
|
|
#include "irregexp/RegExpAST.h"
|
|
|
|
namespace js {
|
|
|
|
namespace frontend {
|
|
class TokenStream;
|
|
}
|
|
|
|
namespace irregexp {
|
|
|
|
bool
|
|
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
|
|
bool multiline, bool match_only, bool unicode, bool ignore_case,
|
|
bool global, bool sticky, bool dotall, RegExpCompileData* data);
|
|
|
|
bool
|
|
ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
|
|
bool unicode, bool dotall);
|
|
|
|
// A BufferedVector is an automatically growing list, just like (and backed
|
|
// by) a Vector, that is optimized for the case of adding and removing
|
|
// a single element. The last element added is stored outside the backing list,
|
|
// and if no more than one element is ever added, the ZoneList isn't even
|
|
// allocated.
|
|
// Elements must not be nullptr pointers.
|
|
template <typename T, int initial_size>
|
|
class BufferedVector
|
|
{
|
|
public:
|
|
typedef InfallibleVector<T*, 1> VectorType;
|
|
|
|
BufferedVector() : list_(nullptr), last_(nullptr) {}
|
|
|
|
// Adds element at end of list. This element is buffered and can
|
|
// be read using last() or removed using RemoveLast until a new Add or until
|
|
// RemoveLast or GetList has been called.
|
|
void Add(LifoAlloc* alloc, T* value) {
|
|
if (last_ != nullptr) {
|
|
if (list_ == nullptr) {
|
|
list_ = alloc->newInfallible<VectorType>(*alloc);
|
|
list_->reserve(initial_size);
|
|
}
|
|
list_->append(last_);
|
|
}
|
|
last_ = value;
|
|
}
|
|
|
|
T* last() {
|
|
MOZ_ASSERT(last_ != nullptr);
|
|
return last_;
|
|
}
|
|
|
|
T* RemoveLast() {
|
|
MOZ_ASSERT(last_ != nullptr);
|
|
T* result = last_;
|
|
if ((list_ != nullptr) && (list_->length() > 0))
|
|
last_ = list_->popCopy();
|
|
else
|
|
last_ = nullptr;
|
|
return result;
|
|
}
|
|
|
|
T* Get(int i) {
|
|
MOZ_ASSERT((0 <= i) && (i < length()));
|
|
if (list_ == nullptr) {
|
|
MOZ_ASSERT(0 == i);
|
|
return last_;
|
|
} else {
|
|
if (size_t(i) == list_->length()) {
|
|
MOZ_ASSERT(last_ != nullptr);
|
|
return last_;
|
|
} else {
|
|
return (*list_)[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
void Clear() {
|
|
list_ = nullptr;
|
|
last_ = nullptr;
|
|
}
|
|
|
|
int length() {
|
|
int length = (list_ == nullptr) ? 0 : list_->length();
|
|
return length + ((last_ == nullptr) ? 0 : 1);
|
|
}
|
|
|
|
VectorType* GetList(LifoAlloc* alloc) {
|
|
if (list_ == nullptr)
|
|
list_ = alloc->newInfallible<VectorType>(*alloc);
|
|
if (last_ != nullptr) {
|
|
list_->append(last_);
|
|
last_ = nullptr;
|
|
}
|
|
return list_;
|
|
}
|
|
|
|
private:
|
|
VectorType* list_;
|
|
T* last_;
|
|
};
|
|
|
|
|
|
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
|
class RegExpBuilder
|
|
{
|
|
public:
|
|
explicit RegExpBuilder(LifoAlloc* alloc);
|
|
void AddCharacter(char16_t character);
|
|
// "Adds" an empty expression. Does nothing except consume a
|
|
// following quantifier
|
|
void AddEmpty();
|
|
void AddAtom(RegExpTree* tree);
|
|
void AddAssertion(RegExpTree* tree);
|
|
void NewAlternative(); // '|'
|
|
void AddQuantifierToAtom(int min, int max, RegExpQuantifier::QuantifierType type);
|
|
RegExpTree* ToRegExp();
|
|
|
|
private:
|
|
void FlushCharacters();
|
|
void FlushText();
|
|
void FlushTerms();
|
|
|
|
LifoAlloc* alloc;
|
|
bool pending_empty_;
|
|
CharacterVector* characters_;
|
|
BufferedVector<RegExpTree, 2> terms_;
|
|
BufferedVector<RegExpTree, 2> text_;
|
|
BufferedVector<RegExpTree, 2> alternatives_;
|
|
|
|
enum LastAdded {
|
|
ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM
|
|
};
|
|
#ifdef DEBUG
|
|
LastAdded last_added_;
|
|
#endif
|
|
};
|
|
|
|
// Characters parsed by RegExpParser can be either char16_t or kEndMarker.
|
|
// 32 bits wide, so it can hold every char16_t value as well as the
// out-of-band RegExpParser::kEndMarker (1 << 21).
using widechar = uint32_t;
|
|
|
|
template <typename CharT>
|
|
class RegExpParser
|
|
{
|
|
public:
|
|
RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
|
|
const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
|
|
bool ignore_case, bool dotall);
|
|
|
|
RegExpTree* ParsePattern();
|
|
RegExpTree* ParseDisjunction();
|
|
RegExpTree* ParseCharacterClass();
|
|
|
|
// Parses a {...,...} quantifier and stores the range in the given
|
|
// out parameters.
|
|
bool ParseIntervalQuantifier(int* min_out, int* max_out);
|
|
|
|
// Tries to parse the input as a single escaped character. If successful
|
|
// it stores the result in the output parameter and returns true.
|
|
// Otherwise it throws an error and returns false. The character must not
|
|
// be 'b' or 'B' since they are usually handled specially.
|
|
bool ParseClassCharacterEscape(widechar* code);
|
|
|
|
// Checks whether the following is a length-digit hexadecimal number,
|
|
// and sets the value if it is.
|
|
bool ParseHexEscape(int length, widechar* value);
|
|
|
|
bool ParseBracedHexEscape(widechar* value);
|
|
bool ParseTrailSurrogate(widechar* value);
|
|
bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail);
|
|
|
|
widechar ParseOctalLiteral();
|
|
|
|
// Tries to parse the input as a back reference. If successful it
|
|
// stores the result in the output parameter and returns true. If
|
|
// it fails it will push back the characters read so the same characters
|
|
// can be reparsed.
|
|
bool ParseBackReferenceIndex(int* index_out);
|
|
|
|
bool ParseClassAtom(char16_t* char_class, widechar *value);
|
|
RegExpTree* ReportError(unsigned errorNumber, const char* param = nullptr);
|
|
void Advance();
|
|
void Advance(int dist) {
|
|
next_pos_ += dist - 1;
|
|
Advance();
|
|
}
|
|
|
|
void Reset(const CharT* pos) {
|
|
next_pos_ = pos;
|
|
has_more_ = (pos < end_);
|
|
Advance();
|
|
}
|
|
|
|
// Reports whether the pattern might be used as a literal search string.
|
|
// Only use if the result of the parse is a single atom node.
|
|
bool simple() { return simple_; }
|
|
bool contains_anchor() { return contains_anchor_; }
|
|
void set_contains_anchor() { contains_anchor_ = true; }
|
|
int captures_started() { return captures_started_; }
|
|
const CharT* position() { return next_pos_ - 1; }
|
|
|
|
static const int kMaxCaptures = 1 << 16;
|
|
static const widechar kEndMarker = (1 << 21);
|
|
|
|
private:
|
|
enum SubexpressionType {
|
|
INITIAL,
|
|
CAPTURE, // All positive values represent captures.
|
|
POSITIVE_LOOKAROUND,
|
|
NEGATIVE_LOOKAROUND,
|
|
GROUPING
|
|
};
|
|
|
|
class RegExpParserState {
|
|
public:
|
|
RegExpParserState(LifoAlloc* alloc,
|
|
RegExpParserState* previous_state,
|
|
SubexpressionType group_type,
|
|
RegExpLookaround::Type lookaround_type,
|
|
int disjunction_capture_index)
|
|
: previous_state_(previous_state),
|
|
builder_(alloc->newInfallible<RegExpBuilder>(alloc)),
|
|
group_type_(group_type),
|
|
lookaround_type_(lookaround_type),
|
|
disjunction_capture_index_(disjunction_capture_index)
|
|
{}
|
|
// Parser state of containing expression, if any.
|
|
RegExpParserState* previous_state() { return previous_state_; }
|
|
bool IsSubexpression() { return previous_state_ != nullptr; }
|
|
// RegExpBuilder building this regexp's AST.
|
|
RegExpBuilder* builder() { return builder_; }
|
|
// Type of regexp being parsed (parenthesized group or entire regexp).
|
|
SubexpressionType group_type() { return group_type_; }
|
|
// Lookahead or Lookbehind.
|
|
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
|
|
// Index in captures array of first capture in this sub-expression, if any.
|
|
// Also the capture index of this sub-expression itself, if group_type
|
|
// is CAPTURE.
|
|
int capture_index() { return disjunction_capture_index_; }
|
|
|
|
// Check whether the parser is inside a capture group with the given index.
|
|
bool IsInsideCaptureGroup(int index);
|
|
|
|
private:
|
|
// Linked list implementation of stack of states.
|
|
RegExpParserState* previous_state_;
|
|
// Builder for the stored disjunction.
|
|
RegExpBuilder* builder_;
|
|
// Stored disjunction type (capture, look-ahead or grouping), if any.
|
|
SubexpressionType group_type_;
|
|
// Stored read direction.
|
|
RegExpLookaround::Type lookaround_type_;
|
|
// Stored disjunction's capture index (if any).
|
|
int disjunction_capture_index_;
|
|
};
|
|
|
|
// Return the 1-indexed RegExpCapture object, allocate if necessary.
|
|
RegExpCapture* GetCapture(int index);
|
|
|
|
widechar current() { return current_; }
|
|
bool has_more() { return has_more_; }
|
|
bool has_next() { return next_pos_ < end_; }
|
|
widechar Next() {
|
|
if (has_next())
|
|
return *next_pos_;
|
|
return kEndMarker;
|
|
}
|
|
void ScanForCaptures();
|
|
|
|
frontend::TokenStream& ts;
|
|
LifoAlloc* alloc;
|
|
RegExpCaptureVector* captures_;
|
|
const CharT* next_pos_;
|
|
const CharT* end_;
|
|
widechar current_;
|
|
int captures_started_;
|
|
// The capture count is only valid after we have scanned for captures.
|
|
int capture_count_;
|
|
bool has_more_;
|
|
bool multiline_;
|
|
bool unicode_;
|
|
bool ignore_case_;
|
|
bool dotall_;
|
|
bool simple_;
|
|
bool contains_anchor_;
|
|
bool is_scanned_for_captures_;
|
|
};
|
|
|
|
} } // namespace js::irregexp
|
|
|
|
#endif // V8_PARSER_H_
|