Implement /s (dotAll) flag for Regexes.

This commit is contained in:
Fedor 2019-12-25 15:46:17 +03:00
parent d9300b2bb0
commit 85906a8fa5
12 changed files with 118 additions and 23 deletions

View File

@ -178,7 +178,7 @@ CheckPatternSyntax(JSContext* cx, HandleAtom pattern, RegExpFlag flags)
CompileOptions options(cx);
frontend::TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
return irregexp::ParsePatternSyntax(dummyTokenStream, cx->tempLifoAlloc(), pattern,
flags & UnicodeFlag);
flags & UnicodeFlag, flags & DotAllFlag);
}
enum RegExpSharedUse {
@ -664,6 +664,29 @@ js::regexp_multiline(JSContext* cx, unsigned argc, JS::Value* vp)
return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_multiline_impl>(cx, args);
}
// ES 2018 dotAll
MOZ_ALWAYS_INLINE bool
regexp_dotall_impl(JSContext* cx, const CallArgs& args)
{
MOZ_ASSERT(IsRegExpInstanceOrPrototype(args.thisv()));
if (!IsRegExpObject(args.thisv())) {
args.rval().setUndefined();
return true;
}
Rooted<RegExpObject*> reObj(cx, &args.thisv().toObject().as<RegExpObject>());
args.rval().setBoolean(reObj->dotall());
return true;
}
bool
js::regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp)
{
CallArgs args = CallArgsFromVp(argc, vp);
return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_dotall_impl>(cx, args);
}
// ES 2017 draft rev32 21.2.5.10.
MOZ_ALWAYS_INLINE bool
regexp_source_impl(JSContext* cx, const CallArgs& args)
@ -759,6 +782,7 @@ const JSPropertySpec js::regexp_properties[] = {
JS_PSG("source", regexp_source, 0),
JS_PSG("sticky", regexp_sticky, 0),
JS_PSG("unicode", regexp_unicode, 0),
JS_PSG("dotall", regexp_dotall, 0),
JS_PS_END
};
@ -1642,6 +1666,13 @@ js::RegExpPrototypeOptimizableRaw(JSContext* cx, JSObject* proto)
if (unicodeGetter != regexp_unicode)
return false;
JSNative dotAllGetter;
if (!GetOwnNativeGetterPure(cx, proto, NameToId(cx->names().dotall), &dotAllGetter))
return false;
if (dotAllGetter != regexp_dotall)
return false;
// Check if @@match, @@search, and exec are own data properties,
// those values should be tested in selfhosted JS.
bool has = false;

View File

@ -153,6 +153,8 @@ extern MOZ_MUST_USE bool
regexp_sticky(JSContext* cx, unsigned argc, JS::Value* vp);
extern MOZ_MUST_USE bool
regexp_unicode(JSContext* cx, unsigned argc, JS::Value* vp);
extern MOZ_MUST_USE bool
regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp);
} /* namespace js */

View File

@ -3,6 +3,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// ES6 draft rev34 (2015/02/20) 21.2.5.3 get RegExp.prototype.flags
// Updated for ES2018 /s (dotAll)
function RegExpFlagsGetter() {
// Steps 1-2.
var R = this;
@ -31,6 +32,10 @@ function RegExpFlagsGetter() {
// Steps 16-18.
if (R.sticky)
result += "y";
// ES2018
if (R.dotall)
result += "s";
// Step 19.
return result;

View File

@ -90,6 +90,7 @@
#define REGEXP_MULTILINE_FLAG 0x04
#define REGEXP_STICKY_FLAG 0x08
#define REGEXP_UNICODE_FLAG 0x10
#define REGEXP_DOTALL_FLAG 0x20
#define MODULE_OBJECT_ENVIRONMENT_SLOT 2

View File

@ -3975,6 +3975,7 @@ ParseRegExp(JSContext* cx, unsigned argc, Value* vp)
flags & MultilineFlag, match_only,
flags & UnicodeFlag, flags & IgnoreCaseFlag,
flags & GlobalFlag, flags & StickyFlag,
flags & DotAllFlag,
&data))
{
return false;

View File

@ -1843,6 +1843,8 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier)
reflags = RegExpFlag(reflags | StickyFlag);
else if (c == 'u' && !(reflags & UnicodeFlag))
reflags = RegExpFlag(reflags | UnicodeFlag);
else if (c == 's' && !(reflags & DotAllFlag))
reflags = RegExpFlag(reflags | DotAllFlag);
else
break;
getChar();

View File

@ -222,7 +222,7 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max,
template <typename CharT>
RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
const CharT* chars, const CharT* end, bool multiline_mode,
bool unicode, bool ignore_case)
bool unicode, bool ignore_case, bool dotall)
: ts(ts),
alloc(alloc),
captures_(nullptr),
@ -235,6 +235,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
multiline_(multiline_mode),
unicode_(unicode),
ignore_case_(ignore_case),
dotall_(dotall),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false)
@ -1384,7 +1385,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
{
RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
// everything except \x0a, \x0d, \u2028 and \u2029
// Everything except \x0a, \x0d, \u2028 and \u2029
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
ranges->append(CharacterRange::Range(0x0, 0x09));
@ -1414,6 +1415,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
return builder->ToRegExp();
}
static inline RegExpTree*
UnicodeDotAllAtom(LifoAlloc* alloc)
{
RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
// Full range excluding surrogates because /s was specified
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
builder->NewAlternative();
builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
unicode::TrailSurrogateMax));
builder->NewAlternative();
builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
builder->NewAlternative();
builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
return builder->ToRegExp();
}
RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
{
@ -1541,13 +1574,25 @@ RegExpParser<CharT>::ParseDisjunction()
}
case '.': {
Advance();
// everything except \x0a, \x0d, \u2028 and \u2029
if (unicode_) {
builder->AddAtom(UnicodeEverythingAtom(alloc));
if (dotall_) {
// Everything
builder->AddAtom(UnicodeDotAllAtom(alloc));
} else {
// Everything except \x0a, \x0d, \u2028 and \u2029
builder->AddAtom(UnicodeEverythingAtom(alloc));
}
break;
}
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
CharacterRange::AddClassEscape(alloc, '.', ranges);
if (dotall_) {
// Everything
CharacterRange::AddClassEscape(alloc, '*', ranges);
} else {
// Everything except \x0a, \x0d, \u2028 and \u2029
CharacterRange::AddClassEscape(alloc, '.', ranges);
}
RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
builder->AddAtom(atom);
break;
@ -1880,7 +1925,7 @@ template <typename CharT>
static bool
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
bool multiline, bool match_only, bool unicode, bool ignore_case,
bool global, bool sticky, RegExpCompileData* data)
bool global, bool sticky, bool dotall, RegExpCompileData* data)
{
if (match_only) {
// Try to strip a leading '.*' from the RegExp, but only if it is not
@ -1907,7 +1952,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
}
}
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case, dotall);
data->tree = parser.ParsePattern();
if (!data->tree)
return false;
@ -1921,33 +1966,33 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
bool
irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool multiline, bool match_only, bool unicode, bool ignore_case,
bool global, bool sticky, RegExpCompileData* data)
bool global, bool sticky, bool dotall, RegExpCompileData* data)
{
JS::AutoCheckCannotGC nogc;
return str->hasLatin1Chars()
? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
multiline, match_only, unicode, ignore_case, global, sticky, data)
multiline, match_only, unicode, ignore_case, global, sticky, dotall, data)
: ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
multiline, match_only, unicode, ignore_case, global, sticky, data);
multiline, match_only, unicode, ignore_case, global, sticky, dotall, data);
}
template <typename CharT>
static bool
ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
bool unicode)
bool unicode, bool dotall)
{
LifoAllocScope scope(&alloc);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, dotall, false);
return parser.ParsePattern() != nullptr;
}
bool
irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool unicode)
bool unicode, bool dotall)
{
JS::AutoCheckCannotGC nogc;
return str->hasLatin1Chars()
? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
: ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode, dotall)
: ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode, dotall);
}

View File

@ -44,11 +44,11 @@ namespace irregexp {
bool
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool multiline, bool match_only, bool unicode, bool ignore_case,
bool global, bool sticky, RegExpCompileData* data);
bool global, bool sticky, bool dotall, RegExpCompileData* data);
bool
ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool unicode);
bool unicode, bool dotall);
// A BufferedVector is an automatically growing list, just like (and backed
// by) a Vector, that is optimized for the case of adding and removing
@ -178,7 +178,7 @@ class RegExpParser
public:
RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
bool ignore_case);
bool ignore_case, bool dotall);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
@ -313,6 +313,7 @@ class RegExpParser
bool multiline_;
bool unicode_;
bool ignore_case_;
bool dotall_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;

View File

@ -5704,6 +5704,7 @@ JS_ObjectIsDate(JSContext* cx, JS::HandleObject obj, bool* isDate);
#define JSREG_MULTILINE 0x04u /* treat ^ and $ as begin and end of line */
#define JSREG_STICKY 0x08u /* only match starting at lastIndex */
#define JSREG_UNICODE 0x10u /* unicode */
#define JSREG_DOTALL 0x20u /* match . to everything including newlines */
extern JS_PUBLIC_API(JSObject*)
JS_NewRegExpObject(JSContext* cx, const char* bytes, size_t length, unsigned flags);

View File

@ -97,6 +97,7 @@
macro(displayURL, displayURL, "displayURL") \
macro(do, do_, "do") \
macro(done, done, "done") \
macro(dotall, dotall, "dotall") \
macro(dotGenerator, dotGenerator, ".generator") \
macro(dotThis, dotThis, ".this") \
macro(each, each, "each") \

View File

@ -49,6 +49,7 @@ JS_STATIC_ASSERT(GlobalFlag == JSREG_GLOB);
JS_STATIC_ASSERT(MultilineFlag == JSREG_MULTILINE);
JS_STATIC_ASSERT(StickyFlag == JSREG_STICKY);
JS_STATIC_ASSERT(UnicodeFlag == JSREG_UNICODE);
JS_STATIC_ASSERT(DotAllFlag == JSREG_DOTALL);
RegExpObject*
js::RegExpAlloc(ExclusiveContext* cx, HandleObject proto /* = nullptr */)
@ -267,7 +268,7 @@ RegExpObject::create(ExclusiveContext* cx, HandleAtom source, RegExpFlag flags,
tokenStream = dummyTokenStream.ptr();
}
if (!irregexp::ParsePatternSyntax(*tokenStream, alloc, source, flags & UnicodeFlag))
if (!irregexp::ParsePatternSyntax(*tokenStream, alloc, source, flags & UnicodeFlag, flags & DotAllFlag))
return nullptr;
Rooted<RegExpObject*> regexp(cx, RegExpAlloc(cx));
@ -1017,7 +1018,7 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu
irregexp::RegExpCompileData data;
if (!irregexp::ParsePattern(dummyTokenStream, cx->tempLifoAlloc(), pattern,
multiline(), mode == MatchOnly, unicode(), ignoreCase(),
global(), sticky(), &data))
global(), sticky(), dotall(), &data))
{
return false;
}

View File

@ -53,16 +53,18 @@ enum RegExpFlag
MultilineFlag = 0x04,
StickyFlag = 0x08,
UnicodeFlag = 0x10,
DotAllFlag = 0x20,
NoFlags = 0x00,
AllFlags = 0x1f
AllFlags = 0x3f
};
static_assert(IgnoreCaseFlag == REGEXP_IGNORECASE_FLAG &&
GlobalFlag == REGEXP_GLOBAL_FLAG &&
MultilineFlag == REGEXP_MULTILINE_FLAG &&
StickyFlag == REGEXP_STICKY_FLAG &&
UnicodeFlag == REGEXP_UNICODE_FLAG,
UnicodeFlag == REGEXP_UNICODE_FLAG &&
DotAllFlag == REGEXP_DOTALL_FLAG,
"Flag values should be in sync with self-hosted JS");
enum RegExpRunStatus
@ -193,6 +195,7 @@ class RegExpShared
bool multiline() const { return flags & MultilineFlag; }
bool sticky() const { return flags & StickyFlag; }
bool unicode() const { return flags & UnicodeFlag; }
bool dotall() const { return flags & DotAllFlag; }
bool isCompiled(CompilationMode mode, bool latin1,
ForceByteCodeEnum force = DontForceByteCode) const {
@ -480,6 +483,7 @@ class RegExpObject : public NativeObject
bool multiline() const { return getFlags() & MultilineFlag; }
bool sticky() const { return getFlags() & StickyFlag; }
bool unicode() const { return getFlags() & UnicodeFlag; }
bool dotall() const { return getFlags() & DotAllFlag; }
static bool isOriginalFlagGetter(JSNative native, RegExpFlag* mask);