/*
* Copyright (C) 2009 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef YarrParser_h
#define YarrParser_h
#include "Yarr.h"
#include
#include
#include
namespace JSC { namespace Yarr {
#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
enum BuiltInCharacterClassID {
DigitClassID,
SpaceClassID,
WordClassID,
NewlineClassID,
};
// The Parser class should not be used directly - only via the Yarr::parse() method.
template
class Parser {
private:
template
friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
enum ErrorCode {
NoError,
PatternTooLarge,
QuantifierOutOfOrder,
QuantifierWithoutAtom,
QuantifierTooLarge,
MissingParentheses,
ParenthesesUnmatched,
ParenthesesTypeInvalid,
CharacterClassUnmatched,
CharacterClassOutOfOrder,
EscapeUnterminated,
NumberOfErrorCodes
};
/*
* CharacterClassParserDelegate:
*
* The class CharacterClassParserDelegate is used in the parsing of character
* classes. This class handles detection of character ranges. This class
* implements enough of the delegate interface such that it can be passed to
* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
* to perform the parsing of escape characters in character sets.
*/
class CharacterClassParserDelegate {
public:
CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
: m_delegate(delegate)
, m_err(err)
, m_state(Empty)
, m_character(0)
{
}
/*
* begin():
*
* Called at beginning of construction.
*/
void begin(bool invert)
{
m_delegate.atomCharacterClassBegin(invert);
}
/*
* atomPatternCharacter():
*
* This method is called either from parseCharacterClass() (for an unescaped
* character in a character class), or from parseEscape(). In the former case
* the value true will be passed for the argument 'hyphenIsRange', and in this
* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
* is different to /[a\-z]/).
*/
void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
{
switch (m_state) {
case AfterCharacterClass:
// Following a builtin character class we need look out for a hyphen.
// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
// If we see a hyphen following a charater class then unlike usual
// we'll report it to the delegate immediately, and put ourself into
// a poisoned state. Any following calls to add another character or
// character class will result in an error. (A hypen following a
// character-class is itself valid, but only at the end of a regex).
if (hyphenIsRange && ch == '-') {
m_delegate.atomCharacterClassAtom('-');
m_state = AfterCharacterClassHyphen;
return;
}
// Otherwise just fall through - cached character so treat this as Empty.
case Empty:
m_character = ch;
m_state = CachedCharacter;
return;
case CachedCharacter:
if (hyphenIsRange && ch == '-')
m_state = CachedCharacterHyphen;
else {
m_delegate.atomCharacterClassAtom(m_character);
m_character = ch;
}
return;
case CachedCharacterHyphen:
if (ch ())
, m_size(pattern.length())
, m_index(0)
, m_parenthesesNestingDepth(0)
{
}
/*
* parseEscape():
*
* Helper for parseTokens() AND parseCharacterClass().
* Unlike the other parser methods, this function does not report tokens
* directly to the member delegate (m_delegate), instead tokens are
* emitted to the delegate provided as an argument. In the case of atom
* escapes, parseTokens() will call parseEscape() passing m_delegate as
* an argument, and as such the escape will be reported to the delegate.
*
* However this method may also be used by parseCharacterClass(), in which
* case a CharacterClassParserDelegate will be passed as the delegate that
* tokens should be added to. A boolean flag is also provided to indicate
* whether that an escape in a CharacterClass is being parsed (some parsing
* rules change in this context).
*
* The boolean value returned by this method indicates whether the token
* parsed was an atom (outside of a characted class \b and \B will be
* interpreted as assertions).
*/
template
bool parseEscape(EscapeDelegate& delegate)
{
ASSERT(!m_err);
ASSERT(peek() == '\\');
consume();
if (atEndOfPattern()) {
m_err = EscapeUnterminated;
return false;
}
switch (peek()) {
// Assertions
case 'b':
consume();
if (inCharacterClass)
delegate.atomPatternCharacter('\b');
else {
delegate.assertionWordBoundary(false);
return false;
}
break;
case 'B':
consume();
if (inCharacterClass)
delegate.atomPatternCharacter('B');
else {
delegate.assertionWordBoundary(true);
return false;
}
break;
// CharacterClassEscape
case 'd':
consume();
delegate.atomBuiltInCharacterClass(DigitClassID, false);
break;
case 's':
consume();
delegate.atomBuiltInCharacterClass(SpaceClassID, false);
break;
case 'w':
consume();
delegate.atomBuiltInCharacterClass(WordClassID, false);
break;
case 'D':
consume();
delegate.atomBuiltInCharacterClass(DigitClassID, true);
break;
case 'S':
consume();
delegate.atomBuiltInCharacterClass(SpaceClassID, true);
break;
case 'W':
consume();
delegate.atomBuiltInCharacterClass(WordClassID, true);
break;
// DecimalEscape
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
// To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
// First, try to parse this as backreference.
if (!inCharacterClass) {
ParseState state = saveState();
unsigned backReference = consumeNumber();
if (backReference = '8') {
delegate.atomPatternCharacter('\\');
break;
}
// Fall-through to handle this as an octal escape.
}
// Octal escape
case '0':
delegate.atomPatternCharacter(consumeOctal());
break;
// ControlEscape
case 'f':
consume();
delegate.atomPatternCharacter('\f');
break;
case 'n':
consume();
delegate.atomPatternCharacter('\n');
break;
case 'r':
consume();
delegate.atomPatternCharacter('\r');
break;
case 't':
consume();
delegate.atomPatternCharacter('\t');
break;
case 'v':
consume();
delegate.atomPatternCharacter('\v');
break;
// ControlLetter
case 'c': {
ParseState state = saveState();
consume();
if (!atEndOfPattern()) {
int control = consume();
// To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
delegate.atomPatternCharacter(control & 0x1f);
break;
}
}
restoreState(state);
delegate.atomPatternCharacter('\\');
break;
}
// HexEscape
case 'x': {
consume();
int x = tryConsumeHex(2);
if (x == -1)
delegate.atomPatternCharacter('x');
else
delegate.atomPatternCharacter(x);
break;
}
// UnicodeEscape
case 'u': {
consume();
int u = tryConsumeHex(4);
if (u == -1)
delegate.atomPatternCharacter('u');
else
delegate.atomPatternCharacter(u);
break;
}
// IdentityEscape
default:
delegate.atomPatternCharacter(consume());
}
return true;
}
/*
* parseAtomEscape(), parseCharacterClassEscape():
*
* These methods alias to parseEscape().
*/
bool parseAtomEscape()
{
return parseEscape(m_delegate);
}
void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
{
parseEscape(delegate);
}
/*
* parseCharacterClass():
*
* Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
* to an instance of CharacterClassParserDelegate, to describe the character class to the
* delegate.
*/
void parseCharacterClass()
{
ASSERT(!m_err);
ASSERT(peek() == '[');
consume();
CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
characterClassConstructor.begin(tryConsume('^'));
while (!atEndOfPattern()) {
switch (peek()) {
case ']':
consume();
characterClassConstructor.end();
return;
case '\\':
parseCharacterClassEscape(characterClassConstructor);
break;
default:
characterClassConstructor.atomPatternCharacter(consume(), true);
}
if (m_err)
return;
}
m_err = CharacterClassUnmatched;
}
/*
* parseParenthesesBegin():
*
* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
*/
void parseParenthesesBegin()
{
ASSERT(!m_err);
ASSERT(peek() == '(');
consume();
if (tryConsume('?')) {
if (atEndOfPattern()) {
m_err = ParenthesesTypeInvalid;
return;
}
switch (consume()) {
case ':':
m_delegate.atomParenthesesSubpatternBegin(false);
break;
case '=':
m_delegate.atomParentheticalAssertionBegin();
break;
case '!':
m_delegate.atomParentheticalAssertionBegin(true);
break;
default:
m_err = ParenthesesTypeInvalid;
}
} else
m_delegate.atomParenthesesSubpatternBegin();
++m_parenthesesNestingDepth;
}
/*
* parseParenthesesEnd():
*
* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
*/
void parseParenthesesEnd()
{
ASSERT(!m_err);
ASSERT(peek() == ')');
consume();
if (m_parenthesesNestingDepth > 0)
m_delegate.atomParenthesesEnd();
else
m_err = ParenthesesUnmatched;
--m_parenthesesNestingDepth;
}
/*
* parseQuantifier():
*
* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
*/
void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
{
ASSERT(!m_err);
ASSERT(min 0)
m_err = MissingParentheses;
}
/*
* parse():
*
* This method calls parseTokens() to parse over the input and converts any
* error code to a const char* for a result.
*/
const char* parse()
{
if (m_size > MAX_PATTERN_SIZE)
m_err = PatternTooLarge;
else
parseTokens();
ASSERT(atEndOfPattern() || m_err);
// The order of this array must match the ErrorCode enum.
static const char* errorMessages[NumberOfErrorCodes] = {
0, // NoError
REGEXP_ERROR_PREFIX "regular expression too large",
REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
REGEXP_ERROR_PREFIX "nothing to repeat",
REGEXP_ERROR_PREFIX "number too large in {} quantifier",
REGEXP_ERROR_PREFIX "missing )",
REGEXP_ERROR_PREFIX "unmatched parentheses",
REGEXP_ERROR_PREFIX "unrecognized character after (?",
REGEXP_ERROR_PREFIX "missing terminating ] for character class",
REGEXP_ERROR_PREFIX "range out of order in character class",
REGEXP_ERROR_PREFIX "\\ at end of pattern"
};
return errorMessages[m_err];
}
// Misc helper functions:
typedef unsigned ParseState;
ParseState saveState()
{
return m_index;
}
void restoreState(ParseState state)
{
m_index = state;
}
bool atEndOfPattern()
{
ASSERT(m_index = n); ) {
n = newValue;
consume();
}
return n;
}
unsigned consumeOctal()
{
ASSERT(WTF::isASCIIOctalDigit(peek()));
unsigned n = consumeDigit();
while (n
const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
{
if (pattern.is8Bit())
return Parser(delegate, pattern, backReferenceLimit).parse();
return Parser(delegate, pattern, backReferenceLimit).parse();
}
} } // namespace JSC::Yarr
#endif // YarrParser_h