Mypal/js/src/regexp/regexp-parser.h

// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_

#include "regexp/regexp-ast.h"
#include "regexp/regexp-error.h"

namespace v8 {
namespace internal {

struct RegExpCompileData;

// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be nullptr pointers.
template <typename T, int initial_size>
class BufferedZoneList {
 public:
  BufferedZoneList() : list_(nullptr), last_(nullptr) {}

  // Adds element at end of list. This element is buffered and can
  // be read using last() or removed using RemoveLast until a new Add or until
  // RemoveLast or GetList has been called.
  void Add(T* value, Zone* zone) {
    if (last_ != nullptr) {
      if (list_ == nullptr) {
        list_ = new (zone) ZoneList<T*>(initial_size, zone);
      }
      list_->Add(last_, zone);
    }
    last_ = value;
  }

  T* last() {
    DCHECK(last_ != nullptr);
    return last_;
  }

  T* RemoveLast() {
    DCHECK(last_ != nullptr);
    T* result = last_;
    if ((list_ != nullptr) && (list_->length() > 0))
      last_ = list_->RemoveLast();
    else
      last_ = nullptr;
    return result;
  }

  T* Get(int i) {
    DCHECK((0 <= i) && (i < length()));
    if (list_ == nullptr) {
      DCHECK_EQ(0, i);
      return last_;
    } else {
      if (i == list_->length()) {
        DCHECK(last_ != nullptr);
        return last_;
      } else {
        return list_->at(i);
      }
    }
  }

  void Clear() {
    list_ = nullptr;
    last_ = nullptr;
  }

  int length() {
    int length = (list_ == nullptr) ? 0 : list_->length();
    return length + ((last_ == nullptr) ? 0 : 1);
  }

  ZoneList<T*>* GetList(Zone* zone) {
    if (list_ == nullptr) {
      list_ = new (zone) ZoneList<T*>(initial_size, zone);
    }
    if (last_ != nullptr) {
      list_->Add(last_, zone);
      last_ = nullptr;
    }
    return list_;
  }

 private:
  ZoneList<T*>* list_;
  T* last_;
};


// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
 public:
  RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
  void AddCharacter(uc16 character);
  void AddUnicodeCharacter(uc32 character);
  void AddEscapedUnicodeCharacter(uc32 character);
  // "Adds" an empty expression. Does nothing except consume a
  // following quantifier
  void AddEmpty();
  void AddCharacterClass(RegExpCharacterClass* cc);
  void AddCharacterClassForDesugaring(uc32 c);
  void AddAtom(RegExpTree* tree);
  void AddTerm(RegExpTree* tree);
  void AddAssertion(RegExpTree* tree);
  void NewAlternative();  // '|'
  bool AddQuantifierToAtom(int min, int max,
                           RegExpQuantifier::QuantifierType type);
  void FlushText();
  RegExpTree* ToRegExp();
  JSRegExp::Flags flags() const { return flags_; }
  void set_flags(JSRegExp::Flags flags) { flags_ = flags; }

  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
  bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }

 private:
  static const uc16 kNoPendingSurrogate = 0;
  void AddLeadSurrogate(uc16 lead_surrogate);
  void AddTrailSurrogate(uc16 trail_surrogate);
  void FlushPendingSurrogate();
  void FlushCharacters();
  void FlushTerms();
  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
  bool NeedsDesugaringForIgnoreCase(uc32 c);
  Zone* zone() const { return zone_; }
  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }

  Zone* zone_;
  bool pending_empty_;
  JSRegExp::Flags flags_;
  ZoneList<uc16>* characters_;
  uc16 pending_surrogate_;
  BufferedZoneList<RegExpTree, 2> terms_;
  BufferedZoneList<RegExpTree, 2> text_;
  BufferedZoneList<RegExpTree, 2> alternatives_;
#ifdef DEBUG
  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};

class V8_EXPORT_PRIVATE RegExpParser {
 public:
  RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
               Zone* zone);

  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
                          JSRegExp::Flags flags, RegExpCompileData* result);

  RegExpTree* ParsePattern();
  RegExpTree* ParseDisjunction();
  RegExpTree* ParseGroup();

  // Parses a {...,...} quantifier and stores the range in the given
  // out parameters.
  bool ParseIntervalQuantifier(int* min_out, int* max_out);

  // Parses and returns a single escaped character.  The character
  // must not be 'b' or 'B' since they are usually handle specially.
  uc32 ParseClassCharacterEscape();

  // Checks whether the following is a length-digit hexadecimal number,
  // and sets the value if it is.
  bool ParseHexEscape(int length, uc32* value);
  bool ParseUnicodeEscape(uc32* value);
  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);

  bool ParsePropertyClassName(ZoneVector<char>* name_1,
                              ZoneVector<char>* name_2);
  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
                             const ZoneVector<char>& name_1,
                             const ZoneVector<char>& name_2);

  RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);

  uc32 ParseOctalLiteral();

  // Tries to parse the input as a back reference.  If successful it
  // stores the result in the output parameter and returns true.  If
  // it fails it will push back the characters read so the same characters
  // can be reparsed.
  bool ParseBackReferenceIndex(int* index_out);

  // Parse inside a class. Either add escaped class to the range, or return
  // false and pass parsed single character through |char_out|.
  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
                        bool add_unicode_case_equivalents, uc32* char_out,
                        bool* is_class_escape);

  char ParseClassEscape();

  RegExpTree* ReportError(RegExpError error);
  void Advance();
  void Advance(int dist);
  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.
  // Only use if the result of the parse is a single atom node.
  bool simple();
  bool contains_anchor() { return contains_anchor_; }
  void set_contains_anchor() { contains_anchor_ = true; }
  int captures_started() { return captures_started_; }
  int position() { return next_pos_ - 1; }
  bool failed() { return failed_; }
  // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
  // just read the initial flag value here.
  bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }

  static bool IsSyntaxCharacterOrSlash(uc32 c);

  static const uc32 kEndMarker = (1 << 21);

 private:
  enum SubexpressionType {
    INITIAL,
    CAPTURE,  // All positive values represent captures.
    POSITIVE_LOOKAROUND,
    NEGATIVE_LOOKAROUND,
    GROUPING
  };

  class RegExpParserState : public ZoneObject {
   public:
    // Push a state on the stack.
    RegExpParserState(RegExpParserState* previous_state,
                      SubexpressionType group_type,
                      RegExpLookaround::Type lookaround_type,
                      int disjunction_capture_index,
                      const ZoneVector<uc16>* capture_name,
                      JSRegExp::Flags flags, Zone* zone)
        : previous_state_(previous_state),
          builder_(new (zone) RegExpBuilder(zone, flags)),
          group_type_(group_type),
          lookaround_type_(lookaround_type),
          disjunction_capture_index_(disjunction_capture_index),
          capture_name_(capture_name) {}
    // Parser state of containing expression, if any.
    RegExpParserState* previous_state() const { return previous_state_; }
    bool IsSubexpression() { return previous_state_ != nullptr; }
    // RegExpBuilder building this regexp's AST.
    RegExpBuilder* builder() const { return builder_; }
    // Type of regexp being parsed (parenthesized group or entire regexp).
    SubexpressionType group_type() const { return group_type_; }
    // Lookahead or Lookbehind.
    RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
    // Index in captures array of first capture in this sub-expression, if any.
    // Also the capture index of this sub-expression itself, if group_type
    // is CAPTURE.
    int capture_index() const { return disjunction_capture_index_; }
    // The name of the current sub-expression, if group_type is CAPTURE. Only
    // used for named captures.
    const ZoneVector<uc16>* capture_name() const { return capture_name_; }

    bool IsNamedCapture() const { return capture_name_ != nullptr; }

    // Check whether the parser is inside a capture group with the given index.
    bool IsInsideCaptureGroup(int index);
    // Check whether the parser is inside a capture group with the given name.
    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);

   private:
    // Linked list implementation of stack of states.
    RegExpParserState* const previous_state_;
    // Builder for the stored disjunction.
    RegExpBuilder* const builder_;
    // Stored disjunction type (capture, look-ahead or grouping), if any.
    const SubexpressionType group_type_;
    // Stored read direction.
    const RegExpLookaround::Type lookaround_type_;
    // Stored disjunction's capture index (if any).
    const int disjunction_capture_index_;
    // Stored capture name (if any).
    const ZoneVector<uc16>* const capture_name_;
  };

  // Return the 1-indexed RegExpCapture object, allocate if necessary.
  RegExpCapture* GetCapture(int index);

  // Creates a new named capture at the specified index. Must be called exactly
  // once for each named capture. Fails if a capture with the same name is
  // encountered.
  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);

  // Parses the name of a capture group (?<name>pattern). The name must adhere
  // to IdentifierName in the ECMAScript standard.
  const ZoneVector<uc16>* ParseCaptureGroupName();

  bool ParseNamedBackReference(RegExpBuilder* builder,
                               RegExpParserState* state);
  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);

  // After the initial parsing pass, patch corresponding RegExpCapture objects
  // into all RegExpBackReferences. This is done after initial parsing in order
  // to avoid complicating cases in which references comes before the capture.
  void PatchNamedBackReferences();

  Handle<FixedArray> CreateCaptureNameMap();

  // Returns true iff the pattern contains named captures. May call
  // ScanForCaptures to look ahead at the remaining pattern.
  bool HasNamedCaptures();

  Isolate* isolate() { return isolate_; }
  Zone* zone() const { return zone_; }

  uc32 current() { return current_; }
  bool has_more() { return has_more_; }
  bool has_next() { return next_pos_ < in()->length(); }
  uc32 Next();
  template <bool update_position>
  uc32 ReadNext();
  FlatStringReader* in() { return in_; }
  void ScanForCaptures();

  struct RegExpCaptureNameLess {
    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
      DCHECK_NOT_NULL(lhs);
      DCHECK_NOT_NULL(rhs);
      return *lhs->name() < *rhs->name();
    }
  };

  Isolate* isolate_;
  Zone* zone_;
  RegExpError error_ = RegExpError::kNone;
  int error_pos_ = 0;
  ZoneList<RegExpCapture*>* captures_;
  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
  ZoneList<RegExpBackReference*>* named_back_references_;
  FlatStringReader* in_;
  uc32 current_;
  // These are the flags specified outside the regexp syntax ie after the
  // terminating '/' or in the second argument to the constructor.  The current
  // flags are stored on the RegExpBuilder.
  JSRegExp::Flags top_level_flags_;
  int next_pos_;
  int captures_started_;
  int capture_count_;  // Only valid after we have scanned for captures.
  bool has_more_;
  bool simple_;
  bool contains_anchor_;
  bool is_scanned_for_captures_;
  bool has_named_captures_;  // Only valid after we have scanned for captures.
  bool failed_;
};

}  // namespace internal
}  // namespace v8

#endif  // V8_REGEXP_REGEXP_PARSER_H_